From ec0280be113b860f9cb6fec50d77a59e35bf25d2 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 11 Jul 2012 11:06:30 -0700 Subject: [PATCH 01/15] Rename gather/scatter_base_offsets functions to *factored_base_offsets*. No functional change; just preparation for having a path that doesn't factor the offsets into constant and varying parts, which will be better for AVX2 and KNC. --- builtins/target-generic-common.ll | 8 +- builtins/util.m4 | 156 +++++++-------- examples/intrinsics/generic-16.h | 52 ++--- examples/intrinsics/generic-32.h | 52 ++--- examples/intrinsics/generic-64.h | 52 ++--- examples/intrinsics/knc.h | 40 ++-- examples/intrinsics/sse4.h | 40 ++-- opt.cpp | 322 +++++++++++++++--------------- 8 files changed, 361 insertions(+), 361 deletions(-) diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 48e7b836..c54dd948 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -334,18 +334,18 @@ define void @__masked_store_blend_double(* nocapture, ;; gather/scatter define(`gather_scatter', ` -declare @__gather_base_offsets32_$1(i8 * nocapture, , +declare @__gather_factored_base_offsets32_$1(i8 * nocapture, , i32, , ) nounwind readonly -declare @__gather_base_offsets64_$1(i8 * nocapture, , +declare @__gather_factored_base_offsets64_$1(i8 * nocapture, , i32, , ) nounwind readonly declare @__gather32_$1(, ) nounwind readonly declare @__gather64_$1(, ) nounwind readonly -declare void @__scatter_base_offsets32_$1(i8* nocapture, , +declare void @__scatter_factored_base_offsets32_$1(i8* nocapture, , i32, , , ) nounwind -declare void @__scatter_base_offsets64_$1(i8* nocapture, , +declare void @__scatter_factored_base_offsets64_$1(i8* nocapture, , i32, , , ) nounwind declare void @__scatter32_$1(, , ) nounwind diff --git a/builtins/util.m4 b/builtins/util.m4 index ce25a761..4a8822bb 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1599,7 +1599,7 @@ declare void 
@__pseudo_masked_store_double( * nocapture, @__pseudo_gather64_float(, ) declare @__pseudo_gather64_i64(, ) nounwind readonly declare @__pseudo_gather64_double(, ) nounwind readonly -declare @__pseudo_gather_base_offsets32_i8(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_i16(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_i32(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_float(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_i64(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets32_double(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_i8(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_i16(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_i32(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_float(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_i64(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, , ) nounwind readonly -declare @__pseudo_gather_base_offsets64_double(i8 *, , i32, , +declare @__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, , ) nounwind readonly ; Similarly 
to the pseudo-gathers defined above, we also declare undefined @@ -1660,7 +1660,7 @@ declare @__pseudo_gather_base_offsets64_double(i8 *, , , , , ) nounwind declare void @__pseudo_scatter64_double(, , ) nounwind -declare void @__pseudo_scatter_base_offsets32_i8(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_i16(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_i32(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_float(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_i64(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets32_double(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_i8(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_i16(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_i32(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_float(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, , , ) nounwind -declare void 
@__pseudo_scatter_base_offsets64_i64(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, , , ) nounwind -declare void @__pseudo_scatter_base_offsets64_double(i8 * nocapture, , i32, , +declare void @__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, , , ) nounwind declare float @__log_uniform_float(float) nounwind readnone @@ -1872,103 +1872,103 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, call void @__usedouble( %g64_d) %pgbo32_8 = call - @__pseudo_gather_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use8( %pgbo32_8) %pgbo32_16 = call - @__pseudo_gather_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use16( %pgbo32_16) %pgbo32_32 = call - @__pseudo_gather_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use32( %pgbo32_32) %pgbo32_f = call - @__pseudo_gather_base_offsets32_float(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usefloat( %pgbo32_f) %pgbo32_64 = call - @__pseudo_gather_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use64( %pgbo32_64) %pgbo32_d = call - @__pseudo_gather_base_offsets32_double(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usedouble( %pgbo32_d) %gbo32_8 = call - @__gather_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use8( %gbo32_8) %gbo32_16 = call - @__gather_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, 
i32 0, %v32, %mask) call void @__use16( %gbo32_16) %gbo32_32 = call - @__gather_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use32( %gbo32_32) %gbo32_f = call - @__gather_base_offsets32_float(i8 * %ptr, %v32, i32 0, + @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usefloat( %gbo32_f) %gbo32_64 = call - @__gather_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use64( %gbo32_64) %gbo32_d = call - @__gather_base_offsets32_double(i8 * %ptr, %v32, i32 0, + @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usedouble( %gbo32_d) %pgbo64_8 = call - @__pseudo_gather_base_offsets64_i8(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use8( %pgbo64_8) %pgbo64_16 = call - @__pseudo_gather_base_offsets64_i16(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use16( %pgbo64_16) %pgbo64_32 = call - @__pseudo_gather_base_offsets64_i32(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use32( %pgbo64_32) %pgbo64_f = call - @__pseudo_gather_base_offsets64_float(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usefloat( %pgbo64_f) %pgbo64_64 = call - @__pseudo_gather_base_offsets64_i64(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use64( %pgbo64_64) %pgbo64_d = call - @__pseudo_gather_base_offsets64_double(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usedouble( %pgbo64_d) %gbo64_8 = call - 
@__gather_base_offsets64_i8(i8 * %ptr, %v64, i32 0, + @__gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use8( %gbo64_8) %gbo64_16 = call - @__gather_base_offsets64_i16(i8 * %ptr, %v64, i32 0, + @__gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use16( %gbo64_16) %gbo64_32 = call - @__gather_base_offsets64_i32(i8 * %ptr, %v64, i32 0, + @__gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use32( %gbo64_32) %gbo64_f = call - @__gather_base_offsets64_float(i8 * %ptr, %v64, i32 0, + @__gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usefloat( %gbo64_f) %gbo64_64 = call - @__gather_base_offsets64_i64(i8 * %ptr, %v64, i32 0, + @__gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use64( %gbo64_64) %gbo64_d = call - @__gather_base_offsets64_double(i8 * %ptr, %v64, i32 0, + @__gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usedouble( %gbo64_d) @@ -2003,56 +2003,56 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, call void @__scatter64_i64( %v64, %v64, %mask) call void @__scatter64_double( %v64, %vd, %mask) - call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %v8, %mask) - call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, %v16, %mask) - call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %v32, %mask) - call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, %vf, %mask) - call void 
@__pseudo_scatter_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %v64, %mask) - call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, %vd, %mask) - call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %v8, %mask) - call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %v16, %mask) - call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %v32, %mask) - call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, %vf, %mask) - call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %v64, %mask) - call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %vd, %mask) - call void @__scatter_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, + call void @__scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %v8, %mask) - call void @__scatter_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, + call void @__scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, %v16, %mask) - call void @__scatter_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, + call void @__scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %v32, %mask) - call void @__scatter_base_offsets32_float(i8 * %ptr, %v32, i32 
0, %v32, + call void @__scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, %vf, %mask) - call void @__scatter_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, + call void @__scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %v64, %mask) - call void @__scatter_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, + call void @__scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, %vd, %mask) - call void @__scatter_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, + call void @__scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %v8, %mask) - call void @__scatter_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, + call void @__scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %v16, %mask) - call void @__scatter_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, + call void @__scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %v32, %mask) - call void @__scatter_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, + call void @__scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, %vf, %mask) - call void @__scatter_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, + call void @__scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %v64, %mask) - call void @__scatter_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, + call void @__scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %vd, %mask) ret void @@ -3245,7 +3245,7 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o } -define @__gather_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, +define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing @@ -3276,7 +3276,7 @@ define @__gather_base_offsets32_$1(i8 * %ptr, %offset ret %ret`'eval(WIDTH-1) } -define @__gather_base_offsets64_$1(i8 * %ptr, %offsets, i32 
%offset_scale, +define @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing @@ -3391,7 +3391,7 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s ret void } -define void @__scatter_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, +define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... @@ -3401,7 +3401,7 @@ define void @__scatter_base_offsets32_$1(i8* %base, %offsets, i32 ret void } -define void @__scatter_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, +define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 1851ff7e..c18e9fbe 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1306,7 +1306,7 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ __vec16_i1 mask) { \ @@ -1322,18 +1322,18 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ } -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) -GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) -GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) -GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) -GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) -GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) -GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) -GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) -GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) -GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) -GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, 
__gather_factored_base_offsets32_i16) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_factored_base_offsets32_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_factored_base_offsets32_float) +GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_factored_base_offsets64_float) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_factored_base_offsets32_double) +GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_factored_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ @@ -1361,7 +1361,7 @@ GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double) // scatter -#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ VTYPE val, __vec16_i1 mask) { \ @@ -1375,18 +1375,18 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ } -SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) -SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) -SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) -SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) 
-SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) -SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) -SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) -SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) -SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) -SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) -SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_factored_base_offsets32_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_factored_base_offsets32_float) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_factored_base_offsets64_float) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_factored_base_offsets32_double) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_factored_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ diff --git 
a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 628aab84..c1f89cd8 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1374,7 +1374,7 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ __vec32_i1 mask) { \ @@ -1390,18 +1390,18 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ } -GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_base_offsets32_i8) -GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_base_offsets64_i8) -GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_base_offsets32_i16) -GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_base_offsets64_i16) -GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_base_offsets32_i32) -GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_base_offsets64_i32) -GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_base_offsets32_float) -GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_base_offsets64_float) -GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_base_offsets32_i64) -GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_base_offsets64_i64) -GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_base_offsets32_double) -GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_base_offsets64_double) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_factored_base_offsets32_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_factored_base_offsets64_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_factored_base_offsets32_i16) 
+GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_factored_base_offsets64_i16) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_factored_base_offsets32_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_factored_base_offsets64_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_factored_base_offsets32_float) +GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_factored_base_offsets64_float) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_factored_base_offsets32_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_factored_base_offsets64_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_factored_base_offsets32_double) +GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_factored_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec32_i1 mask) { \ @@ -1429,7 +1429,7 @@ GATHER_GENERAL(__vec32_d, double, __vec32_i64, __gather64_double) // scatter -#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ VTYPE val, __vec32_i1 mask) { \ @@ -1443,18 +1443,18 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ } -SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_base_offsets32_i8) -SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_base_offsets64_i8) -SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_base_offsets32_i16) -SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_base_offsets64_i16) -SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_base_offsets32_i32) -SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, 
__scatter_base_offsets64_i32) -SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_base_offsets32_float) -SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_base_offsets64_float) -SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_base_offsets32_i64) -SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_base_offsets64_i64) -SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_base_offsets32_double) -SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_base_offsets64_double) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_factored_base_offsets32_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_factored_base_offsets64_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_factored_base_offsets32_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_factored_base_offsets64_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_factored_base_offsets32_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_factored_base_offsets64_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_factored_base_offsets32_float) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_factored_base_offsets64_float) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_factored_base_offsets32_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_factored_base_offsets64_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_factored_base_offsets32_double) +SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_factored_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec32_i1 mask) { \ diff --git a/examples/intrinsics/generic-64.h 
b/examples/intrinsics/generic-64.h index 2630e306..2a54446e 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1507,7 +1507,7 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec64_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ __vec64_i1 mask) { \ @@ -1523,18 +1523,18 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ } -GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_base_offsets32_i8) -GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_base_offsets64_i8) -GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_base_offsets32_i16) -GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_base_offsets64_i16) -GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_base_offsets32_i32) -GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_base_offsets64_i32) -GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_base_offsets32_float) -GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_base_offsets64_float) -GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_base_offsets32_i64) -GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_base_offsets64_i64) -GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_base_offsets32_double) -GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_double) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_factored_base_offsets32_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_factored_base_offsets64_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_factored_base_offsets32_i16) 
+GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_factored_base_offsets64_i16) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_factored_base_offsets32_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_factored_base_offsets64_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_factored_base_offsets32_float) +GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_factored_base_offsets64_float) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_factored_base_offsets32_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_factored_base_offsets64_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_factored_base_offsets32_double) +GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_factored_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \ @@ -1562,7 +1562,7 @@ GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double) // scatter -#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ VTYPE val, __vec64_i1 mask) { \ @@ -1576,18 +1576,18 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ } -SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_base_offsets32_i8) -SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_base_offsets64_i8) -SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_base_offsets32_i16) -SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_base_offsets64_i16) -SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_base_offsets32_i32) -SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, 
__scatter_base_offsets64_i32) -SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_base_offsets32_float) -SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_base_offsets64_float) -SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_base_offsets32_i64) -SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_base_offsets64_i64) -SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_base_offsets32_double) -SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_base_offsets64_double) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_factored_base_offsets32_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_factored_base_offsets64_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_factored_base_offsets32_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_factored_base_offsets64_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_factored_base_offsets32_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_factored_base_offsets64_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_factored_base_offsets32_float) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_factored_base_offsets64_float) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_factored_base_offsets32_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_factored_base_offsets64_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_factored_base_offsets32_double) +SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_factored_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec64_i1 mask) { \ diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h 
index eceeb885..fb11db11 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1940,7 +1940,7 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) +#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) /* static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ @@ -1958,7 +1958,7 @@ static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ */ static FORCEINLINE __vec16_i32 -__gather_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset, +__gather_factored_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset, uint32_t scale, __vec16_i32 constOffset, __vec16_i1 mask) { __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); @@ -1973,7 +1973,7 @@ __gather_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset, } static FORCEINLINE __vec16_f -__gather_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset, +__gather_factored_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset, uint32_t scale, __vec16_i32 constOffset, __vec16_i1 mask) { __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); @@ -1987,13 +1987,13 @@ __gather_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset, return ret; } -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) -GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) -GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) -GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) -GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) 
-GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64) +GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) /* @@ -2039,7 +2039,7 @@ static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask) */ // scatter -#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) +#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) /* static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ @@ -2054,16 +2054,16 @@ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ } */ -SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) -SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) -SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) -SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) -SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) -SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, 
__scatter_factored_base_offsets32_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64) +SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64) static FORCEINLINE void -__scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset, +__scatter_factored_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset, uint32_t scale, __vec16_i32 constOffset, __vec16_i32 val, __vec16_i1 mask) { @@ -2072,7 +2072,7 @@ __scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset, } static FORCEINLINE void -__scatter_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset, +__scatter_factored_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset, uint32_t scale, const __vec16_i32 &constOffset, const __vec16_f &val, const __vec16_i1 mask) { diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index fcc14618..088b694d 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -3007,84 +3007,84 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, } static FORCEINLINE __vec4_i8 -__gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, +__gather_factored_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i8 -__gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets, 
+__gather_factored_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i16 -__gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, +__gather_factored_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i16 - __gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets, + __gather_factored_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i32 -__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale, +__gather_factored_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i32 -__gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, +__gather_factored_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_f -__gather_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale, +__gather_factored_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_f(), float(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_f -__gather_base_offsets64_float(unsigned char *p, __vec4_i64 offsets, 
+__gather_factored_base_offsets64_float(unsigned char *p, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_f(), float(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i64 -__gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, +__gather_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_i64 -__gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, +__gather_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_d -__gather_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, +__gather_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets32(__vec4_d(), double(), p, offsets, scale, constOffset, mask); } static FORCEINLINE __vec4_d -__gather_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, +__gather_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { return lGatherBaseOffsets64(__vec4_d(), double(), p, offsets, scale, constOffset, mask); @@ -3252,7 +3252,7 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) { #define SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \ static FORCEINLINE void \ -__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \ +__scatter_factored_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \ uint32_t scale, __vec4_i32 constOffset, \ __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \ uint32_t m = _mm_extract_ps(mask.v, 
0); \ @@ -3281,7 +3281,7 @@ __scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \ } \ } \ static FORCEINLINE void \ -__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \ +__scatter_factored_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \ uint32_t scale, __vec4_i64 constOffset, \ __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \ uint32_t m = _mm_extract_ps(mask.v, 0); \ @@ -3322,7 +3322,7 @@ SCATTER32_64(float, f, float, _mm_extract_ps_as_float) static FORCEINLINE void -__scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, +__scatter_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val, __vec4_i1 mask) { uint32_t m = _mm_extract_ps(mask.v, 0); @@ -3359,7 +3359,7 @@ __scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, } static FORCEINLINE void -__scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, +__scatter_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_i64 val, __vec4_i1 mask) { uint32_t m = _mm_extract_ps(mask.v, 0); @@ -3396,17 +3396,17 @@ __scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, } static FORCEINLINE void -__scatter_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, +__scatter_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, uint32_t scale, __vec4_i32 constOffset, __vec4_d val, __vec4_i1 mask) { - __scatter_base_offsets32_i64(p, offsets, scale, constOffset, val, mask); + __scatter_factored_base_offsets32_i64(p, offsets, scale, constOffset, val, mask); } static FORCEINLINE void -__scatter_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, +__scatter_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, uint32_t scale, __vec4_i64 constOffset, __vec4_d val, __vec4_i1 mask) { - __scatter_base_offsets64_i64(p, offsets, scale, constOffset, val, mask); + 
__scatter_factored_base_offsets64_i64(p, offsets, scale, constOffset, val, mask); } diff --git a/opt.cpp b/opt.cpp index 1456dfd7..1140c9ce 100644 --- a/opt.cpp +++ b/opt.cpp @@ -1689,57 +1689,57 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { }; GSInfo gsFuncs[] = { - GSInfo("__pseudo_gather32_i8", "__pseudo_gather_base_offsets32_i8", - "__pseudo_gather_base_offsets32_i8", true), - GSInfo("__pseudo_gather32_i16", "__pseudo_gather_base_offsets32_i16", - "__pseudo_gather_base_offsets32_i16", true), - GSInfo("__pseudo_gather32_i32", "__pseudo_gather_base_offsets32_i32", - "__pseudo_gather_base_offsets32_i32", true), - GSInfo("__pseudo_gather32_float", "__pseudo_gather_base_offsets32_float", - "__pseudo_gather_base_offsets32_float", true), - GSInfo("__pseudo_gather32_i64", "__pseudo_gather_base_offsets32_i64", - "__pseudo_gather_base_offsets32_i64", true), - GSInfo("__pseudo_gather32_double", "__pseudo_gather_base_offsets32_double", - "__pseudo_gather_base_offsets32_double", true), + GSInfo("__pseudo_gather32_i8", "__pseudo_gather_factored_base_offsets32_i8", + "__pseudo_gather_factored_base_offsets32_i8", true), + GSInfo("__pseudo_gather32_i16", "__pseudo_gather_factored_base_offsets32_i16", + "__pseudo_gather_factored_base_offsets32_i16", true), + GSInfo("__pseudo_gather32_i32", "__pseudo_gather_factored_base_offsets32_i32", + "__pseudo_gather_factored_base_offsets32_i32", true), + GSInfo("__pseudo_gather32_float", "__pseudo_gather_factored_base_offsets32_float", + "__pseudo_gather_factored_base_offsets32_float", true), + GSInfo("__pseudo_gather32_i64", "__pseudo_gather_factored_base_offsets32_i64", + "__pseudo_gather_factored_base_offsets32_i64", true), + GSInfo("__pseudo_gather32_double", "__pseudo_gather_factored_base_offsets32_double", + "__pseudo_gather_factored_base_offsets32_double", true), - GSInfo("__pseudo_scatter32_i8", "__pseudo_scatter_base_offsets32_i8", - "__pseudo_scatter_base_offsets32_i8", false), - GSInfo("__pseudo_scatter32_i16", 
"__pseudo_scatter_base_offsets32_i16", - "__pseudo_scatter_base_offsets32_i16", false), - GSInfo("__pseudo_scatter32_i32", "__pseudo_scatter_base_offsets32_i32", - "__pseudo_scatter_base_offsets32_i32", false), - GSInfo("__pseudo_scatter32_float", "__pseudo_scatter_base_offsets32_float", - "__pseudo_scatter_base_offsets32_float", false), - GSInfo("__pseudo_scatter32_i64", "__pseudo_scatter_base_offsets32_i64", - "__pseudo_scatter_base_offsets32_i64", false), - GSInfo("__pseudo_scatter32_double", "__pseudo_scatter_base_offsets32_double", - "__pseudo_scatter_base_offsets32_double", false), + GSInfo("__pseudo_scatter32_i8", "__pseudo_scatter_factored_base_offsets32_i8", + "__pseudo_scatter_factored_base_offsets32_i8", false), + GSInfo("__pseudo_scatter32_i16", "__pseudo_scatter_factored_base_offsets32_i16", + "__pseudo_scatter_factored_base_offsets32_i16", false), + GSInfo("__pseudo_scatter32_i32", "__pseudo_scatter_factored_base_offsets32_i32", + "__pseudo_scatter_factored_base_offsets32_i32", false), + GSInfo("__pseudo_scatter32_float", "__pseudo_scatter_factored_base_offsets32_float", + "__pseudo_scatter_factored_base_offsets32_float", false), + GSInfo("__pseudo_scatter32_i64", "__pseudo_scatter_factored_base_offsets32_i64", + "__pseudo_scatter_factored_base_offsets32_i64", false), + GSInfo("__pseudo_scatter32_double", "__pseudo_scatter_factored_base_offsets32_double", + "__pseudo_scatter_factored_base_offsets32_double", false), - GSInfo("__pseudo_gather64_i8", "__pseudo_gather_base_offsets64_i8", - "__pseudo_gather_base_offsets32_i8", true), - GSInfo("__pseudo_gather64_i16", "__pseudo_gather_base_offsets64_i16", - "__pseudo_gather_base_offsets32_i16", true), - GSInfo("__pseudo_gather64_i32", "__pseudo_gather_base_offsets64_i32", - "__pseudo_gather_base_offsets32_i32", true), - GSInfo("__pseudo_gather64_float", "__pseudo_gather_base_offsets64_float", - "__pseudo_gather_base_offsets32_float", true), - GSInfo("__pseudo_gather64_i64", 
"__pseudo_gather_base_offsets64_i64", - "__pseudo_gather_base_offsets32_i64", true), - GSInfo("__pseudo_gather64_double", "__pseudo_gather_base_offsets64_double", - "__pseudo_gather_base_offsets32_double", true), + GSInfo("__pseudo_gather64_i8", "__pseudo_gather_factored_base_offsets64_i8", + "__pseudo_gather_factored_base_offsets32_i8", true), + GSInfo("__pseudo_gather64_i16", "__pseudo_gather_factored_base_offsets64_i16", + "__pseudo_gather_factored_base_offsets32_i16", true), + GSInfo("__pseudo_gather64_i32", "__pseudo_gather_factored_base_offsets64_i32", + "__pseudo_gather_factored_base_offsets32_i32", true), + GSInfo("__pseudo_gather64_float", "__pseudo_gather_factored_base_offsets64_float", + "__pseudo_gather_factored_base_offsets32_float", true), + GSInfo("__pseudo_gather64_i64", "__pseudo_gather_factored_base_offsets64_i64", + "__pseudo_gather_factored_base_offsets32_i64", true), + GSInfo("__pseudo_gather64_double", "__pseudo_gather_factored_base_offsets64_double", + "__pseudo_gather_factored_base_offsets32_double", true), - GSInfo("__pseudo_scatter64_i8", "__pseudo_scatter_base_offsets64_i8", - "__pseudo_scatter_base_offsets32_i8", false), - GSInfo("__pseudo_scatter64_i16", "__pseudo_scatter_base_offsets64_i16", - "__pseudo_scatter_base_offsets32_i16", false), - GSInfo("__pseudo_scatter64_i32", "__pseudo_scatter_base_offsets64_i32", - "__pseudo_scatter_base_offsets32_i32", false), - GSInfo("__pseudo_scatter64_float", "__pseudo_scatter_base_offsets64_float", - "__pseudo_scatter_base_offsets32_float", false), - GSInfo("__pseudo_scatter64_i64", "__pseudo_scatter_base_offsets64_i64", - "__pseudo_scatter_base_offsets32_i64", false), - GSInfo("__pseudo_scatter64_double", "__pseudo_scatter_base_offsets64_double", - "__pseudo_scatter_base_offsets32_double", false), + GSInfo("__pseudo_scatter64_i8", "__pseudo_scatter_factored_base_offsets64_i8", + "__pseudo_scatter_factored_base_offsets32_i8", false), + GSInfo("__pseudo_scatter64_i16", 
"__pseudo_scatter_factored_base_offsets64_i16", + "__pseudo_scatter_factored_base_offsets32_i16", false), + GSInfo("__pseudo_scatter64_i32", "__pseudo_scatter_factored_base_offsets64_i32", + "__pseudo_scatter_factored_base_offsets32_i32", false), + GSInfo("__pseudo_scatter64_float", "__pseudo_scatter_factored_base_offsets64_float", + "__pseudo_scatter_factored_base_offsets32_float", false), + GSInfo("__pseudo_scatter64_i64", "__pseudo_scatter_factored_base_offsets64_i64", + "__pseudo_scatter_factored_base_offsets32_i64", false), + GSInfo("__pseudo_scatter64_double", "__pseudo_scatter_factored_base_offsets64_double", + "__pseudo_scatter_factored_base_offsets32_double", false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -1858,57 +1858,57 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) { }; GSBOInfo gsFuncs[] = { - GSBOInfo("__pseudo_gather_base_offsets32_i8", - "__pseudo_gather_base_offsets32_i8", true), - GSBOInfo("__pseudo_gather_base_offsets32_i16", - "__pseudo_gather_base_offsets32_i16", true), - GSBOInfo("__pseudo_gather_base_offsets32_i32", - "__pseudo_gather_base_offsets32_i32", true), - GSBOInfo("__pseudo_gather_base_offsets32_float", - "__pseudo_gather_base_offsets32_float", true), - GSBOInfo("__pseudo_gather_base_offsets32_i64", - "__pseudo_gather_base_offsets32_i64", true), - GSBOInfo("__pseudo_gather_base_offsets32_double", - "__pseudo_gather_base_offsets32_double", true), + GSBOInfo("__pseudo_gather_factored_base_offsets32_i8", + "__pseudo_gather_factored_base_offsets32_i8", true), + GSBOInfo("__pseudo_gather_factored_base_offsets32_i16", + "__pseudo_gather_factored_base_offsets32_i16", true), + GSBOInfo("__pseudo_gather_factored_base_offsets32_i32", + "__pseudo_gather_factored_base_offsets32_i32", true), + GSBOInfo("__pseudo_gather_factored_base_offsets32_float", + "__pseudo_gather_factored_base_offsets32_float", true), + GSBOInfo("__pseudo_gather_factored_base_offsets32_i64", + "__pseudo_gather_factored_base_offsets32_i64", 
true), + GSBOInfo("__pseudo_gather_factored_base_offsets32_double", + "__pseudo_gather_factored_base_offsets32_double", true), - GSBOInfo( "__pseudo_scatter_base_offsets32_i8", - "__pseudo_scatter_base_offsets32_i8", false), - GSBOInfo("__pseudo_scatter_base_offsets32_i16", - "__pseudo_scatter_base_offsets32_i16", false), - GSBOInfo("__pseudo_scatter_base_offsets32_i32", - "__pseudo_scatter_base_offsets32_i32", false), - GSBOInfo("__pseudo_scatter_base_offsets32_float", - "__pseudo_scatter_base_offsets32_float", false), - GSBOInfo("__pseudo_scatter_base_offsets32_i64", - "__pseudo_scatter_base_offsets32_i64", false), - GSBOInfo("__pseudo_scatter_base_offsets32_double", - "__pseudo_scatter_base_offsets32_double", false), + GSBOInfo( "__pseudo_scatter_factored_base_offsets32_i8", + "__pseudo_scatter_factored_base_offsets32_i8", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets32_i16", + "__pseudo_scatter_factored_base_offsets32_i16", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets32_i32", + "__pseudo_scatter_factored_base_offsets32_i32", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets32_float", + "__pseudo_scatter_factored_base_offsets32_float", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets32_i64", + "__pseudo_scatter_factored_base_offsets32_i64", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets32_double", + "__pseudo_scatter_factored_base_offsets32_double", false), - GSBOInfo( "__pseudo_gather_base_offsets64_i8", - "__pseudo_gather_base_offsets32_i8", true), - GSBOInfo("__pseudo_gather_base_offsets64_i16", - "__pseudo_gather_base_offsets32_i16", true), - GSBOInfo("__pseudo_gather_base_offsets64_i32", - "__pseudo_gather_base_offsets32_i32", true), - GSBOInfo("__pseudo_gather_base_offsets64_float", - "__pseudo_gather_base_offsets32_float", true), - GSBOInfo("__pseudo_gather_base_offsets64_i64", - "__pseudo_gather_base_offsets32_i64", true), - GSBOInfo("__pseudo_gather_base_offsets64_double", - 
"__pseudo_gather_base_offsets32_double", true), + GSBOInfo( "__pseudo_gather_factored_base_offsets64_i8", + "__pseudo_gather_factored_base_offsets32_i8", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_i16", + "__pseudo_gather_factored_base_offsets32_i16", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_i32", + "__pseudo_gather_factored_base_offsets32_i32", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_float", + "__pseudo_gather_factored_base_offsets32_float", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_i64", + "__pseudo_gather_factored_base_offsets32_i64", true), + GSBOInfo("__pseudo_gather_factored_base_offsets64_double", + "__pseudo_gather_factored_base_offsets32_double", true), - GSBOInfo( "__pseudo_scatter_base_offsets64_i8", - "__pseudo_scatter_base_offsets32_i8", false), - GSBOInfo("__pseudo_scatter_base_offsets64_i16", - "__pseudo_scatter_base_offsets32_i16", false), - GSBOInfo("__pseudo_scatter_base_offsets64_i32", - "__pseudo_scatter_base_offsets32_i32", false), - GSBOInfo("__pseudo_scatter_base_offsets64_float", - "__pseudo_scatter_base_offsets32_float", false), - GSBOInfo("__pseudo_scatter_base_offsets64_i64", - "__pseudo_scatter_base_offsets32_i64", false), - GSBOInfo("__pseudo_scatter_base_offsets64_double", - "__pseudo_scatter_base_offsets32_double", false), + GSBOInfo( "__pseudo_scatter_factored_base_offsets64_i8", + "__pseudo_scatter_factored_base_offsets32_i8", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_i16", + "__pseudo_scatter_factored_base_offsets32_i16", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_i32", + "__pseudo_scatter_factored_base_offsets32_i32", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_float", + "__pseudo_scatter_factored_base_offsets32_float", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_i64", + "__pseudo_scatter_factored_base_offsets32_i64", false), + GSBOInfo("__pseudo_scatter_factored_base_offsets64_double", + 
"__pseudo_scatter_factored_base_offsets32_double", false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -2025,29 +2025,29 @@ lGSToLoadStore(llvm::CallInst *callInst) { }; GatherImpInfo gInfo[] = { - GatherImpInfo("__pseudo_gather_base_offsets32_i8", "__masked_load_i8", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_i8", "__masked_load_i8", LLVMTypes::Int8Type, 1), - GatherImpInfo("__pseudo_gather_base_offsets32_i16", "__masked_load_i16", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_i16", "__masked_load_i16", LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_base_offsets32_i32", "__masked_load_i32", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_i32", "__masked_load_i32", LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_base_offsets32_float", "__masked_load_float", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_float", "__masked_load_float", LLVMTypes::FloatType, 4), - GatherImpInfo("__pseudo_gather_base_offsets32_i64", "__masked_load_i64", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_i64", "__masked_load_i64", LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_base_offsets32_double", "__masked_load_double", + GatherImpInfo("__pseudo_gather_factored_base_offsets32_double", "__masked_load_double", LLVMTypes::DoubleType, 8), - GatherImpInfo("__pseudo_gather_base_offsets64_i8", "__masked_load_i8", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_i8", "__masked_load_i8", LLVMTypes::Int8Type, 1), - GatherImpInfo("__pseudo_gather_base_offsets64_i16", "__masked_load_i16", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_i16", "__masked_load_i16", LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_base_offsets64_i32", "__masked_load_i32", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_i32", "__masked_load_i32", LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_base_offsets64_float", "__masked_load_float", + 
GatherImpInfo("__pseudo_gather_factored_base_offsets64_float", "__masked_load_float", LLVMTypes::FloatType, 4), - GatherImpInfo("__pseudo_gather_base_offsets64_i64", "__masked_load_i64", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_i64", "__masked_load_i64", LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_base_offsets64_double", "__masked_load_double", + GatherImpInfo("__pseudo_gather_factored_base_offsets64_double", "__masked_load_double", LLVMTypes::DoubleType, 8) }; @@ -2067,29 +2067,29 @@ lGSToLoadStore(llvm::CallInst *callInst) { }; ScatterImpInfo sInfo[] = { - ScatterImpInfo("__pseudo_scatter_base_offsets32_i8", "__pseudo_masked_store_i8", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i8", "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_base_offsets32_i16", "__pseudo_masked_store_i16", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i16", "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_base_offsets32_i32", "__pseudo_masked_store_i32", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i32", "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets32_float", "__pseudo_masked_store_float", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_float", "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets32_i64", "__pseudo_masked_store_i64", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i64", "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_base_offsets32_double", "__pseudo_masked_store_double", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_double", "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i8", 
"__pseudo_masked_store_i8", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i8", "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i16", "__pseudo_masked_store_i16", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i16", "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i32", "__pseudo_masked_store_i32", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i32", "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets64_float", "__pseudo_masked_store_float", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_float", "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i64", "__pseudo_masked_store_i64", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i64", "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_base_offsets64_double", "__pseudo_masked_store_double", + ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_double", "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8) }; @@ -3354,10 +3354,10 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) { DEBUG_START_PASS("GatherCoalescePass"); llvm::Function *gatherFuncs[] = { - m->module->getFunction("__pseudo_gather_base_offsets32_i32"), - m->module->getFunction("__pseudo_gather_base_offsets32_float"), - m->module->getFunction("__pseudo_gather_base_offsets64_i32"), - m->module->getFunction("__pseudo_gather_base_offsets64_float"), + m->module->getFunction("__pseudo_gather_factored_base_offsets32_i32"), + m->module->getFunction("__pseudo_gather_factored_base_offsets32_float"), + m->module->getFunction("__pseudo_gather_factored_base_offsets64_i32"), + m->module->getFunction("__pseudo_gather_factored_base_offsets64_float"), }; int 
nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]); @@ -3367,7 +3367,7 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) { for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { // Iterate over all of the instructions and look for calls to - // __pseudo_gather_base_offsets{32,64}_{i32,float} calls. + // __pseudo_gather_factored_base_offsets{32,64}_{i32,float} calls. llvm::CallInst *callInst = llvm::dyn_cast(&*iter); if (callInst == NULL) continue; @@ -3639,19 +3639,19 @@ lReplacePseudoGS(llvm::CallInst *callInst) { }; LowerGSInfo lgsInfo[] = { - LowerGSInfo("__pseudo_gather_base_offsets32_i8", "__gather_base_offsets32_i8", true), - LowerGSInfo("__pseudo_gather_base_offsets32_i16", "__gather_base_offsets32_i16", true), - LowerGSInfo("__pseudo_gather_base_offsets32_i32", "__gather_base_offsets32_i32", true), - LowerGSInfo("__pseudo_gather_base_offsets32_float", "__gather_base_offsets32_float", true), - LowerGSInfo("__pseudo_gather_base_offsets32_i64", "__gather_base_offsets32_i64", true), - LowerGSInfo("__pseudo_gather_base_offsets32_double", "__gather_base_offsets32_double", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i8", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i16", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i32", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_float", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", "__gather_factored_base_offsets32_i64", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", "__gather_factored_base_offsets32_double", true), - LowerGSInfo("__pseudo_gather_base_offsets64_i8", "__gather_base_offsets64_i8", true), - LowerGSInfo("__pseudo_gather_base_offsets64_i16", "__gather_base_offsets64_i16", true), - 
LowerGSInfo("__pseudo_gather_base_offsets64_i32", "__gather_base_offsets64_i32", true), - LowerGSInfo("__pseudo_gather_base_offsets64_float", "__gather_base_offsets64_float", true), - LowerGSInfo("__pseudo_gather_base_offsets64_i64", "__gather_base_offsets64_i64", true), - LowerGSInfo("__pseudo_gather_base_offsets64_double", "__gather_base_offsets64_double", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i8", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", "__gather_factored_base_offsets64_i16", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i32", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_float", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", "__gather_factored_base_offsets64_i64", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", "__gather_factored_base_offsets64_double", true), LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true), LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true), @@ -3667,19 +3667,19 @@ lReplacePseudoGS(llvm::CallInst *callInst) { LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true), LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true), - LowerGSInfo("__pseudo_scatter_base_offsets32_i8", "__scatter_base_offsets32_i8", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_i16", "__scatter_base_offsets32_i16", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_i32", "__scatter_base_offsets32_i32", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_float", "__scatter_base_offsets32_float", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_i64", "__scatter_base_offsets32_i64", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_double", "__scatter_base_offsets32_double", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", 
"__scatter_factored_base_offsets32_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", "__scatter_factored_base_offsets32_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_float", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", "__scatter_factored_base_offsets32_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", "__scatter_factored_base_offsets32_double", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i8", "__scatter_base_offsets64_i8", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i16", "__scatter_base_offsets64_i16", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i32", "__scatter_base_offsets64_i32", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_float", "__scatter_base_offsets64_float", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i64", "__scatter_base_offsets64_i64", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_double", "__scatter_base_offsets64_double", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", "__scatter_factored_base_offsets64_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_float", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", "__scatter_factored_base_offsets64_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", "__scatter_factored_base_offsets64_double", false), LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false), LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false), @@ -3899,12 
+3899,12 @@ bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { const char *names[] = { "__fast_masked_vload", - "__gather_base_offsets32_i8", "__gather_base_offsets32_i16", - "__gather_base_offsets32_i32", "__gather_base_offsets32_i64", - "__gather_base_offsets32_float", "__gather_base_offsets32_double", - "__gather_base_offsets64_i8", "__gather_base_offsets64_i16", - "__gather_base_offsets64_i32", "__gather_base_offsets64_i64", - "__gather_base_offsets64_float", "__gather_base_offsets64_double", + "__gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i16", + "__gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i64", + "__gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_double", + "__gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i16", + "__gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i64", + "__gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_double", "__gather32_i8", "__gather32_i16", "__gather32_i32", "__gather32_i64", "__gather32_float", "__gather32_double", @@ -3926,12 +3926,12 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { "__masked_store_blend_i8", "__masked_store_blend_i16", "__masked_store_blend_i32", "__masked_store_blend_i64", "__masked_store_blend_float", "__masked_store_blend_double", - "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16", - "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64", - "__scatter_base_offsets32_float", "__scatter_base_offsets32_double", - "__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16", - "__scatter_base_offsets64_i32", "__scatter_base_offsets64_i64", - "__scatter_base_offsets64_float", "__scatter_base_offsets64_double", + "__scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i16", + "__scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i64", + 
"__scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_double", + "__scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i16", + "__scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i64", + "__scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_double", "__scatter_elt32_i8", "__scatter_elt32_i16", "__scatter_elt32_i32", "__scatter_elt32_i64", "__scatter_elt32_float", "__scatter_elt32_double", From 10b79fb41b71caeb94b40f4bff0f6efb7fef0c4a Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 11 Jul 2012 14:09:06 -0700 Subject: [PATCH 02/15] Add support for non-factored variants of gather/scatter functions. We now have two ways of approaching gather/scatters with a common base pointer and with offset vectors. For targets with native gather/scatter, we just turn those into base + {1/2/4/8}*offsets. For targets without, we turn those into base + {1/2/4/8}*varying_offsets + const_offsets, where const_offsets is a compile-time constant. Infrastructure for issue #325. 
--- builtins/util.m4 | 482 +++++++++++++++++++++------ ispc.cpp | 1 + ispc.h | 8 +- opt.cpp | 830 +++++++++++++++++++++++++++++++++-------------- 4 files changed, 965 insertions(+), 356 deletions(-) diff --git a/builtins/util.m4 b/builtins/util.m4 index 4a8822bb..26466b1f 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1579,7 +1579,7 @@ declare void @__pseudo_masked_store_double( * nocapture, * nocapture, @__pseudo_gather32_i8(, ) nounwind readonly declare @__pseudo_gather32_i16(, ) nounwind readonly @@ -1621,31 +1606,106 @@ declare @__pseudo_gather64_float(, ) declare @__pseudo_gather64_i64(, ) nounwind readonly declare @__pseudo_gather64_double(, ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, , - ) nounwind readonly +; The ImproveMemoryOps optimization pass finds these calls and then +; tries to convert them to be calls to gather functions that take a uniform +; base pointer and then a varying integer offset, when possible. +; +; For targets without a native gather instruction, it is best to factor the +; integer offsets like "{1/2/4/8} * varying_offset + constant_offset", +; where varying_offset includes non-compile time constant values, and +; constant_offset includes compile-time constant values. (The scalar loads +; generated in turn can then take advantage of the free offsetting and scale by +; 1/2/4/8 that is offered by the x86 addresisng modes.) 
+; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_factored_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; int{32,64} offsets, uniform int32 offset_scale, +; int{32,64} offset_delta, mask) +; +; For targets with a gather instruction, it is better to just factor them into +; a gather from a uniform base pointer and then "{1/2/4/8} * offsets", where the +; offsets are int32/64 vectors. +; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; uniform int32 offset_scale, int{32,64} offsets, mask) -declare @__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, , - ) nounwind readonly + +declare +@__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, , + ) nounwind readonly 
+declare +@__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_base_offsets32_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_double(i8 *, i32, , + ) nounwind readonly + +declare +@__pseudo_gather_base_offsets64_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_double(i8 *, i32, , + ) nounwind readonly ; Similarly to the pseudo-gathers defined above, we also declare undefined ; pseudo-scatter instructions with signatures: @@ -1657,16 +1717,6 @@ declare @__pseudo_gather_factored_base_offsets64_double(i8 *, < ; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask) ; void __pseudo_scatter_double(varying double *, varying double values, mask) ; -; The GatherScatterFlattenOpt optimization pass also finds these and -; transforms them to scatters like: -; -; void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base, -; varying int32 offsets, uniform int32 offset_scale, -; varying int{32,64} offset_delta, varying int8 values, mask) -; (and similarly for 16/32/64 bit values) -; -; And the 
GSImprovementsPass in turn converts these to actual native -; scatters or masked stores. declare void @__pseudo_scatter32_i8(, , ) nounwind declare void @__pseudo_scatter32_i16(, , ) nounwind @@ -1682,31 +1732,96 @@ declare void @__pseudo_scatter64_float(, , , , ) nounwind declare void @__pseudo_scatter64_double(, , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, , - , ) nounwind +; And the ImproveMemoryOps optimization pass also finds these and +; either transforms them to scatters like: +; +; void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base, +; varying int32 offsets, uniform int32 offset_scale, +; varying int{32,64} offset_delta, varying int8 values, mask) +; (and similarly for 16/32/64 bit values) +; +; Or, if the target has a native scatter instruction: +; +; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base, +; uniform int32 offset_scale, varying int{32,64} offsets, +; varying int8 values, mask) +; (and similarly for 16/32/64 bit values) -declare void @__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, , - , ) nounwind -declare void 
@__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, , - , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_base_offsets32_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i16(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_float(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_double(i8 * nocapture, i32, , + , ) nounwind + +declare void 
+@__pseudo_scatter_base_offsets64_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i16(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_float(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, , + , ) nounwind declare float @__log_uniform_float(float) nounwind readnone declare @__log_varying_float() nounwind readnone @@ -1871,6 +1986,109 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %mask) call void @__usedouble( %g64_d) +ifelse(HAVE_GATHER, `1', +` + %nfpgbo32_8 = call + @__pseudo_gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( %nfpgbo32_8) + %nfpgbo32_16 = call + @__pseudo_gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfpgbo32_16) + %nfpgbo32_32 = call + @__pseudo_gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfpgbo32_32) + %nfpgbo32_f = call + @__pseudo_gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usefloat( %nfpgbo32_f) + %nfpgbo32_64 = call + @__pseudo_gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfpgbo32_64) + %nfpgbo32_d = call + @__pseudo_gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfpgbo32_d) + + %nfpgbo64_8 = call + @__pseudo_gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use8( %nfpgbo64_8) + %nfpgbo64_16 = call + @__pseudo_gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfpgbo64_16) + %nfpgbo64_32 = call + @__pseudo_gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfpgbo64_32) + %nfpgbo64_f = call + 
@__pseudo_gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfpgbo64_f) + %nfpgbo64_64 = call + @__pseudo_gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfpgbo64_64) + %nfpgbo64_d = call + @__pseudo_gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfpgbo64_d) + + %nfgbo32_8 = call + @__gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( %nfgbo32_8) + %nfgbo32_16 = call + @__gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfgbo32_16) + %nfgbo32_32 = call + @__gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfgbo32_32) + %nfgbo32_f = call + @__gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usefloat( %nfgbo32_f) + %nfgbo32_64 = call + @__gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfgbo32_64) + %nfgbo32_d = call + @__gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfgbo32_d) + + %nfgbo64_8 = call + @__gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use8( %nfgbo64_8) + %nfgbo64_16 = call + @__gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfgbo64_16) + %nfgbo64_32 = call + @__gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfgbo64_32) + %nfgbo64_f = call + @__gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfgbo64_f) + %nfgbo64_64 = call + @__gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfgbo64_64) + %nfgbo64_d = call + @__gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfgbo64_d) +', +` %pgbo32_8 = call @__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %mask) @@ -1896,32 +2114,6 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, 
%v16, %v32, %mask) call void @__usedouble( %pgbo32_d) - %gbo32_8 = call - @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use8( %gbo32_8) - %gbo32_16 = call - @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use16( %gbo32_16) - %gbo32_32 = call - @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use32( %gbo32_32) - %gbo32_f = call - @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__usefloat( %gbo32_f) - %gbo32_64 = call - @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use64( %gbo32_64) - %gbo32_d = call - @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__usedouble( %gbo32_d) - - %pgbo64_8 = call @__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %mask) @@ -1947,6 +2139,31 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %v64, %mask) call void @__usedouble( %pgbo64_d) + %gbo32_8 = call + @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use8( %gbo32_8) + %gbo32_16 = call + @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use16( %gbo32_16) + %gbo32_32 = call + @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use32( %gbo32_32) + %gbo32_f = call + @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usefloat( %gbo32_f) + %gbo32_64 = call + @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use64( %gbo32_64) + %gbo32_d = call + @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usedouble( %gbo32_d) + %gbo64_8 = call @__gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %mask) @@ -1970,7 +2187,8 @@ define void 
@__keep_funcs_live(i8 * %ptr, %v8, %v16, %gbo64_d = call @__gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %mask) - call void @__usedouble( %gbo64_d) + call void @__usedouble( %pgbo64_d) +') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; scatters @@ -2003,6 +2221,61 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, call void @__scatter64_i64( %v64, %v64, %mask) call void @__scatter64_double( %v64, %vd, %mask) +ifelse(HAVE_SCATTER, `1', +` + call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, + %v8, %mask) + call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, + %v16, %mask) + call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, + %v32, %mask) + call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, + %vf, %mask) + call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, + %v64, %mask) + call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, + %vd, %mask) + + call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, + %v8, %mask) + call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, + %v16, %mask) + call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, + %v32, %mask) + call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, + %vf, %mask) + call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, + %v64, %mask) + call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, + %vd, %mask) + + call void @__scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, + %v8, %mask) + call void @__scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, + %v16, %mask) + call void @__scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, + %v32, %mask) + call void @__scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, + %vf, %mask) + call void @__scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, + %v64, %mask) + call void 
@__scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, + %vd, %mask) + + call void @__scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, + %v8, %mask) + call void @__scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, + %v16, %mask) + call void @__scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, + %v32, %mask) + call void @__scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, + %vf, %mask) + call void @__scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, + %v64, %mask) + call void @__scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, + %vd, %mask) +', +` call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %v8, %mask) call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, @@ -2054,6 +2327,7 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, %v64, %mask) call void @__scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %vd, %mask) +') ret void } diff --git a/ispc.cpp b/ispc.cpp index 2b98de86..0980c3d2 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -212,6 +212,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, // This is the case for most of them t->hasHalf = t->hasRand = t->hasTranscendentals = false; + t->hasGather = t->hasScatter = false; if (!strcasecmp(isa, "sse2")) { t->isa = Target::SSE2; diff --git a/ispc.h b/ispc.h index 9632f514..66191844 100644 --- a/ispc.h +++ b/ispc.h @@ -252,9 +252,15 @@ struct Target { conversions. */ bool hasHalf; - /** Indicates whether there is an ISA random number instruciton. */ + /** Indicates whether there is an ISA random number instruction. */ bool hasRand; + /** Indicates whether the target has a native gather instruction */ + bool hasGather; + + /** Indicates whether the target has a native scatter instruction */ + bool hasScatter; + /** Indicates whether the target has support for transcendentals (beyond sqrt, which we assume that all of them handle). 
*/ bool hasTranscendentals; diff --git a/opt.cpp b/opt.cpp index 1140c9ce..824c5bc7 100644 --- a/opt.cpp +++ b/opt.cpp @@ -225,7 +225,6 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, } -#if 0 static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2, llvm::Value *arg3, const char *name, @@ -234,7 +233,6 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::ArrayRef newArgArray(&args[0], &args[4]); return llvm::CallInst::Create(func, newArgArray, name, insertBefore); } -#endif static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, @@ -1673,6 +1671,39 @@ lOffsets32BitSafe(llvm::Value **variableOffsetPtr, } +/** Check to see if the single offset vector can safely be represented with + 32-bit values. If so, return true and update the pointed-to + llvm::Value * to be the 32-bit equivalent. */ +static bool +lOffsets32BitSafe(llvm::Value **offsetPtr, + llvm::Instruction *insertBefore) { + llvm::Value *offset = *offsetPtr; + + if (offset->getType() == LLVMTypes::Int32VectorType) + return true; + + llvm::SExtInst *sext = llvm::dyn_cast(offset); + if (sext != NULL && + sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) { + // sext of a 32-bit vector -> the 32-bit vector is good + *offsetPtr = sext->getOperand(0); + return true; + } + else if (lVectorIs32BitInts(offset)) { + // The only constant vector we should have here is a vector of + // all zeros (i.e. a ConstantAggregateZero, but just in case, + // do the more general check with lVectorIs32BitInts(). 
+ *offsetPtr = + new llvm::TruncInst(offset, LLVMTypes::Int32VectorType, + LLVMGetName(offset, "_trunc"), + insertBefore); + return true; + } + else + return false; +} + + static bool lGSToGSBaseOffsets(llvm::CallInst *callInst) { struct GSInfo { @@ -1689,57 +1720,153 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { }; GSInfo gsFuncs[] = { - GSInfo("__pseudo_gather32_i8", "__pseudo_gather_factored_base_offsets32_i8", - "__pseudo_gather_factored_base_offsets32_i8", true), - GSInfo("__pseudo_gather32_i16", "__pseudo_gather_factored_base_offsets32_i16", - "__pseudo_gather_factored_base_offsets32_i16", true), - GSInfo("__pseudo_gather32_i32", "__pseudo_gather_factored_base_offsets32_i32", - "__pseudo_gather_factored_base_offsets32_i32", true), - GSInfo("__pseudo_gather32_float", "__pseudo_gather_factored_base_offsets32_float", - "__pseudo_gather_factored_base_offsets32_float", true), - GSInfo("__pseudo_gather32_i64", "__pseudo_gather_factored_base_offsets32_i64", - "__pseudo_gather_factored_base_offsets32_i64", true), - GSInfo("__pseudo_gather32_double", "__pseudo_gather_factored_base_offsets32_double", - "__pseudo_gather_factored_base_offsets32_double", true), + GSInfo("__pseudo_gather32_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + true), + GSInfo("__pseudo_gather32_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + true), + GSInfo("__pseudo_gather32_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + g->target.hasGather ? 
"__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + true), + GSInfo("__pseudo_gather32_float", + g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + true), + GSInfo("__pseudo_gather32_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + true), + GSInfo("__pseudo_gather32_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + true), - GSInfo("__pseudo_scatter32_i8", "__pseudo_scatter_factored_base_offsets32_i8", - "__pseudo_scatter_factored_base_offsets32_i8", false), - GSInfo("__pseudo_scatter32_i16", "__pseudo_scatter_factored_base_offsets32_i16", - "__pseudo_scatter_factored_base_offsets32_i16", false), - GSInfo("__pseudo_scatter32_i32", "__pseudo_scatter_factored_base_offsets32_i32", - "__pseudo_scatter_factored_base_offsets32_i32", false), - GSInfo("__pseudo_scatter32_float", "__pseudo_scatter_factored_base_offsets32_float", - "__pseudo_scatter_factored_base_offsets32_float", false), - GSInfo("__pseudo_scatter32_i64", "__pseudo_scatter_factored_base_offsets32_i64", - "__pseudo_scatter_factored_base_offsets32_i64", false), - GSInfo("__pseudo_scatter32_double", "__pseudo_scatter_factored_base_offsets32_double", - "__pseudo_scatter_factored_base_offsets32_double", false), + GSInfo("__pseudo_scatter32_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + g->target.hasScatter ? 
"__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + false), + GSInfo("__pseudo_scatter32_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + false), + GSInfo("__pseudo_scatter32_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + false), + GSInfo("__pseudo_scatter32_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + false), + GSInfo("__pseudo_scatter32_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + false), + GSInfo("__pseudo_scatter32_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + g->target.hasScatter ? 
"__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + false), - GSInfo("__pseudo_gather64_i8", "__pseudo_gather_factored_base_offsets64_i8", - "__pseudo_gather_factored_base_offsets32_i8", true), - GSInfo("__pseudo_gather64_i16", "__pseudo_gather_factored_base_offsets64_i16", - "__pseudo_gather_factored_base_offsets32_i16", true), - GSInfo("__pseudo_gather64_i32", "__pseudo_gather_factored_base_offsets64_i32", - "__pseudo_gather_factored_base_offsets32_i32", true), - GSInfo("__pseudo_gather64_float", "__pseudo_gather_factored_base_offsets64_float", - "__pseudo_gather_factored_base_offsets32_float", true), - GSInfo("__pseudo_gather64_i64", "__pseudo_gather_factored_base_offsets64_i64", - "__pseudo_gather_factored_base_offsets32_i64", true), - GSInfo("__pseudo_gather64_double", "__pseudo_gather_factored_base_offsets64_double", - "__pseudo_gather_factored_base_offsets32_double", true), + GSInfo("__pseudo_gather64_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i8" : + "__pseudo_gather_factored_base_offsets64_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + true), + GSInfo("__pseudo_gather64_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i16" : + "__pseudo_gather_factored_base_offsets64_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + true), + GSInfo("__pseudo_gather64_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i32" : + "__pseudo_gather_factored_base_offsets64_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + true), + GSInfo("__pseudo_gather64_float", + g->target.hasGather ? "__pseudo_gather_base_offsets64_float" : + "__pseudo_gather_factored_base_offsets64_float", + g->target.hasGather ? 
"__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + true), + GSInfo("__pseudo_gather64_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i64" : + "__pseudo_gather_factored_base_offsets64_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + true), + GSInfo("__pseudo_gather64_double", + g->target.hasGather ? "__pseudo_gather_base_offsets64_double" : + "__pseudo_gather_factored_base_offsets64_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + true), - GSInfo("__pseudo_scatter64_i8", "__pseudo_scatter_factored_base_offsets64_i8", - "__pseudo_scatter_factored_base_offsets32_i8", false), - GSInfo("__pseudo_scatter64_i16", "__pseudo_scatter_factored_base_offsets64_i16", - "__pseudo_scatter_factored_base_offsets32_i16", false), - GSInfo("__pseudo_scatter64_i32", "__pseudo_scatter_factored_base_offsets64_i32", - "__pseudo_scatter_factored_base_offsets32_i32", false), - GSInfo("__pseudo_scatter64_float", "__pseudo_scatter_factored_base_offsets64_float", - "__pseudo_scatter_factored_base_offsets32_float", false), - GSInfo("__pseudo_scatter64_i64", "__pseudo_scatter_factored_base_offsets64_i64", - "__pseudo_scatter_factored_base_offsets32_i64", false), - GSInfo("__pseudo_scatter64_double", "__pseudo_scatter_factored_base_offsets64_double", - "__pseudo_scatter_factored_base_offsets32_double", false), + GSInfo("__pseudo_scatter64_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i8" : + "__pseudo_scatter_factored_base_offsets64_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + false), + GSInfo("__pseudo_scatter64_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i16" : + "__pseudo_scatter_factored_base_offsets64_i16", + g->target.hasScatter ? 
"__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + false), + GSInfo("__pseudo_scatter64_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i32" : + "__pseudo_scatter_factored_base_offsets64_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + false), + GSInfo("__pseudo_scatter64_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_float" : + "__pseudo_scatter_factored_base_offsets64_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + false), + GSInfo("__pseudo_scatter64_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i64" : + "__pseudo_scatter_factored_base_offsets64_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + false), + GSInfo("__pseudo_scatter64_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_double" : + "__pseudo_scatter_factored_base_offsets64_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -1771,25 +1898,6 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { // to the next instruction... return false; - // Try to decompose the offset vector into a compile time constant - // component and a varying component. The constant component is - // passed as a separate parameter to the gather/scatter functions, - // which in turn allows their implementations to end up emitting - // x86 instructions with constant offsets encoded in them. 
- llvm::Value *constOffset, *variableOffset; - lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, - callInst); - if (constOffset == NULL) - constOffset = LLVMIntAsType(0, offsetVector->getType()); - if (variableOffset == NULL) - variableOffset = LLVMIntAsType(0, offsetVector->getType()); - - // See if the varying component is scaled by 2, 4, or 8. If so, - // extract that scale factor and rewrite variableOffset to remove - // it. (This also is pulled out so that we can match the scales by - // 2/4/8 offered by x86 addressing operators.) - llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset); - // Cast the base pointer to a void *, since that's what the // __pseudo_*_base_offsets_* functions want. basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType, @@ -1798,43 +1906,107 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { llvm::Function *gatherScatterFunc = info->baseOffsetsFunc; - // If we're doing 32-bit addressing on a 64-bit target, here we - // will see if we can call one of the 32-bit variants of the pseudo - // gather/scatter functions. - if (g->opt.force32BitAddressing && - lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) { - gatherScatterFunc = info->baseOffsets32Func; - } + if ((info->isGather == true && g->target.hasGather) || + (info->isGather == false && g->target.hasScatter)) { + // See if the offsets are scaled by 2, 4, or 8. If so, + // extract that scale factor and rewrite the offsets to remove + // it. + llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector); - if (info->isGather) { - llvm::Value *mask = callInst->getArgOperand(1); + // If we're doing 32-bit addressing on a 64-bit target, here we + // will see if we can call one of the 32-bit variants of the pseudo + // gather/scatter functions. 
+ if (g->opt.force32BitAddressing && + lOffsets32BitSafe(&offsetVector, callInst)) { + gatherScatterFunc = info->baseOffsets32Func; + } - // Generate a new function call to the next pseudo gather - // base+offsets instruction. Note that we're passing a NULL - // llvm::Instruction to llvm::CallInst::Create; this means that - // the instruction isn't inserted into a basic block and that - // way we can then call ReplaceInstWithInst(). - llvm::Instruction *newCall = - lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, - constOffset, mask, callInst->getName().str().c_str(), - NULL); - lCopyMetadata(newCall, callInst); - llvm::ReplaceInstWithInst(callInst, newCall); + if (info->isGather) { + llvm::Value *mask = callInst->getArgOperand(1); + + // Generate a new function call to the next pseudo gather + // base+offsets instruction. Note that we're passing a NULL + // llvm::Instruction to llvm::CallInst::Create; this means that + // the instruction isn't inserted into a basic block and that + // way we can then call ReplaceInstWithInst(). + llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector, + mask, callInst->getName().str().c_str(), + NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } + else { + llvm::Value *storeValue = callInst->getArgOperand(1); + llvm::Value *mask = callInst->getArgOperand(2); + + // Generate a new function call to the next pseudo scatter + // base+offsets instruction. See above for why passing NULL + // for the Instruction * is intended. 
+ llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, offsetScale, + offsetVector, storeValue, mask, "", NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } } else { - llvm::Value *storeValue = callInst->getArgOperand(1); - llvm::Value *mask = callInst->getArgOperand(2); + // Try to decompose the offset vector into a compile time constant + // component and a varying component. The constant component is + // passed as a separate parameter to the gather/scatter functions, + // which in turn allows their implementations to end up emitting + // x86 instructions with constant offsets encoded in them. + llvm::Value *constOffset, *variableOffset; + lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, + callInst); + if (constOffset == NULL) + constOffset = LLVMIntAsType(0, offsetVector->getType()); + if (variableOffset == NULL) + variableOffset = LLVMIntAsType(0, offsetVector->getType()); - // Generate a new function call to the next pseudo scatter - // base+offsets instruction. See above for why passing NULL - // for the Instruction * is intended. - llvm::Instruction *newCall = - lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, - constOffset, storeValue, mask, "", NULL); - lCopyMetadata(newCall, callInst); - llvm::ReplaceInstWithInst(callInst, newCall); + // See if the varying component is scaled by 2, 4, or 8. If so, + // extract that scale factor and rewrite variableOffset to remove + // it. (This also is pulled out so that we can match the scales by + // 2/4/8 offered by x86 addressing operators.) + llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset); + + // If we're doing 32-bit addressing on a 64-bit target, here we + // will see if we can call one of the 32-bit variants of the pseudo + // gather/scatter functions. 
+ if (g->opt.force32BitAddressing && + lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) { + gatherScatterFunc = info->baseOffsets32Func; + } + + if (info->isGather) { + llvm::Value *mask = callInst->getArgOperand(1); + + // Generate a new function call to the next pseudo gather + // base+offsets instruction. Note that we're passing a NULL + // llvm::Instruction to llvm::CallInst::Create; this means that + // the instruction isn't inserted into a basic block and that + // way we can then call ReplaceInstWithInst(). + llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, + constOffset, mask, callInst->getName().str().c_str(), + NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } + else { + llvm::Value *storeValue = callInst->getArgOperand(1); + llvm::Value *mask = callInst->getArgOperand(2); + + // Generate a new function call to the next pseudo scatter + // base+offsets instruction. See above for why passing NULL + // for the Instruction * is intended. 
+ llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, + constOffset, storeValue, mask, "", NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } } - return true; } @@ -1858,57 +2030,67 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) { }; GSBOInfo gsFuncs[] = { - GSBOInfo("__pseudo_gather_factored_base_offsets32_i8", - "__pseudo_gather_factored_base_offsets32_i8", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_i16", - "__pseudo_gather_factored_base_offsets32_i16", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_i32", - "__pseudo_gather_factored_base_offsets32_i32", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_float", - "__pseudo_gather_factored_base_offsets32_float", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_i64", - "__pseudo_gather_factored_base_offsets32_i64", true), - GSBOInfo("__pseudo_gather_factored_base_offsets32_double", - "__pseudo_gather_factored_base_offsets32_double", true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + g->target.hasGather ? 
"__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + true), - GSBOInfo( "__pseudo_scatter_factored_base_offsets32_i8", - "__pseudo_scatter_factored_base_offsets32_i8", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_i16", - "__pseudo_scatter_factored_base_offsets32_i16", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_i32", - "__pseudo_scatter_factored_base_offsets32_i32", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_float", - "__pseudo_scatter_factored_base_offsets32_float", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_i64", - "__pseudo_scatter_factored_base_offsets32_i64", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets32_double", - "__pseudo_scatter_factored_base_offsets32_double", false), - - GSBOInfo( "__pseudo_gather_factored_base_offsets64_i8", - "__pseudo_gather_factored_base_offsets32_i8", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_i16", - "__pseudo_gather_factored_base_offsets32_i16", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_i32", - "__pseudo_gather_factored_base_offsets32_i32", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_float", - "__pseudo_gather_factored_base_offsets32_float", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_i64", - "__pseudo_gather_factored_base_offsets32_i64", true), - GSBOInfo("__pseudo_gather_factored_base_offsets64_double", - "__pseudo_gather_factored_base_offsets32_double", 
true), - - GSBOInfo( "__pseudo_scatter_factored_base_offsets64_i8", - "__pseudo_scatter_factored_base_offsets32_i8", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_i16", - "__pseudo_scatter_factored_base_offsets32_i16", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_i32", - "__pseudo_scatter_factored_base_offsets32_i32", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_float", - "__pseudo_scatter_factored_base_offsets32_float", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_i64", - "__pseudo_scatter_factored_base_offsets32_i64", false), - GSBOInfo("__pseudo_scatter_factored_base_offsets64_double", - "__pseudo_scatter_factored_base_offsets32_double", false), + GSBOInfo( g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + g->target.hasScatter ? 
"__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -1991,6 +2173,26 @@ lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets, } +static llvm::Constant * +lGetOffsetScaleVec(llvm::Value *offsetScale, llvm::Type *vecType) { + llvm::ConstantInt *offsetScaleInt = + llvm::dyn_cast(offsetScale); + Assert(offsetScaleInt != NULL); + uint64_t scaleValue = offsetScaleInt->getZExtValue(); + + std::vector scales; + for (int i = 0; i < g->target.vectorWidth; ++i) { + if (vecType == LLVMTypes::Int64VectorType) + scales.push_back(LLVMInt64(scaleValue)); + else { + Assert(vecType == LLVMTypes::Int32VectorType); + scales.push_back(LLVMInt32((int32_t)scaleValue)); + } + } + return llvm::ConstantVector::get(scales); +} + + /** After earlier optimization passes have run, we are sometimes able to determine that gathers/scatters are actually accessing memory in a more regular fashion and then change the operation to something simpler and @@ -2011,7 +2213,7 @@ lGSToLoadStore(llvm::CallInst *callInst) { struct GatherImpInfo { GatherImpInfo(const char *pName, const char *lmName, llvm::Type *st, int a) - : align(a) { + : align(a), isFactored(!g->target.hasGather) { pseudoFunc = m->module->getFunction(pName); loadMaskedFunc = m->module->getFunction(lmName); Assert(pseudoFunc != NULL && loadMaskedFunc != NULL); @@ -2022,39 +2224,52 @@ lGSToLoadStore(llvm::CallInst *callInst) { llvm::Function *loadMaskedFunc; llvm::Type *scalarType; const int align; + const bool isFactored; }; GatherImpInfo gInfo[] = { - GatherImpInfo("__pseudo_gather_factored_base_offsets32_i8", "__masked_load_i8", - LLVMTypes::Int8Type, 1), - 
GatherImpInfo("__pseudo_gather_factored_base_offsets32_i16", "__masked_load_i16", - LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_factored_base_offsets32_i32", "__masked_load_i32", - LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_factored_base_offsets32_float", "__masked_load_float", - LLVMTypes::FloatType, 4), - GatherImpInfo("__pseudo_gather_factored_base_offsets32_i64", "__masked_load_i64", - LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_factored_base_offsets32_double", "__masked_load_double", - LLVMTypes::DoubleType, 8), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_i8", "__masked_load_i8", - LLVMTypes::Int8Type, 1), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_i16", "__masked_load_i16", - LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_i32", "__masked_load_i32", - LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_float", "__masked_load_float", - LLVMTypes::FloatType, 4), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_i64", "__masked_load_i64", - LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_factored_base_offsets64_double", "__masked_load_double", - LLVMTypes::DoubleType, 8) + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + "__masked_load_i8", LLVMTypes::Int8Type, 1), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + "__masked_load_i16", LLVMTypes::Int16Type, 2), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + "__masked_load_i32", LLVMTypes::Int32Type, 4), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + "__masked_load_float", LLVMTypes::FloatType, 4), + GatherImpInfo(g->target.hasGather ? 
"__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + "__masked_load_i64", LLVMTypes::Int64Type, 8), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + "__masked_load_double", LLVMTypes::DoubleType, 8), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i8" : + "__pseudo_gather_factored_base_offsets64_i8", + "__masked_load_i8", LLVMTypes::Int8Type, 1), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i16" : + "__pseudo_gather_factored_base_offsets64_i16", + "__masked_load_i16", LLVMTypes::Int16Type, 2), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i32" : + "__pseudo_gather_factored_base_offsets64_i32", + "__masked_load_i32", LLVMTypes::Int32Type, 4), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_float" : + "__pseudo_gather_factored_base_offsets64_float", + "__masked_load_float", LLVMTypes::FloatType, 4), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i64" : + "__pseudo_gather_factored_base_offsets64_i64", + "__masked_load_i64", LLVMTypes::Int64Type, 8), + GatherImpInfo(g->target.hasGather ? 
"__pseudo_gather_base_offsets64_double" : + "__pseudo_gather_factored_base_offsets64_double", + "__masked_load_double", LLVMTypes::DoubleType, 8), }; struct ScatterImpInfo { ScatterImpInfo(const char *pName, const char *msName, llvm::Type *vpt, int a) - : align(a) { + : align(a), isFactored(!g->target.hasScatter) { pseudoFunc = m->module->getFunction(pName); maskedStoreFunc = m->module->getFunction(msName); vecPtrType = vpt; @@ -2064,33 +2279,46 @@ lGSToLoadStore(llvm::CallInst *callInst) { llvm::Function *maskedStoreFunc; llvm::Type *vecPtrType; const int align; + const bool isFactored; }; ScatterImpInfo sInfo[] = { - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i8", "__pseudo_masked_store_i8", - LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i16", "__pseudo_masked_store_i16", - LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i32", "__pseudo_masked_store_i32", - LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_float", "__pseudo_masked_store_float", - LLVMTypes::FloatVectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_i64", "__pseudo_masked_store_i64", - LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets32_double", "__pseudo_masked_store_double", - LLVMTypes::DoubleVectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i8", "__pseudo_masked_store_i8", - LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i16", "__pseudo_masked_store_i16", - LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i32", "__pseudo_masked_store_i32", - LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_float", "__pseudo_masked_store_float", - LLVMTypes::FloatVectorPointerType, 4), - 
ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_i64", "__pseudo_masked_store_i64", - LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_factored_base_offsets64_double", "__pseudo_masked_store_double", - LLVMTypes::DoubleVectorPointerType, 8) + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i8" : + "__pseudo_scatter_factored_base_offsets64_i8", + "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i16" : + "__pseudo_scatter_factored_base_offsets64_i16", + "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), + ScatterImpInfo(g->target.hasScatter ? 
"__pseudo_scatter_base_offsets64_i32" : + "__pseudo_scatter_factored_base_offsets64_i32", + "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_float" : + "__pseudo_scatter_factored_base_offsets64_float", + "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i64" : + "__pseudo_scatter_factored_base_offsets64_i64", + "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_double" : + "__pseudo_scatter_factored_base_offsets64_double", + "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8), }; llvm::Function *calledFunc = callInst->getCalledFunction(); @@ -2118,34 +2346,45 @@ lGSToLoadStore(llvm::CallInst *callInst) { lGetSourcePosFromMetadata(callInst, &pos); llvm::Value *base = callInst->getArgOperand(0); - llvm::Value *varyingOffsets = callInst->getArgOperand(1); - llvm::Value *offsetScale = callInst->getArgOperand(2); - llvm::Value *constOffsets = callInst->getArgOperand(3); - llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(4) : NULL; - llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 
4 : 5); + llvm::Value *fullOffsets = NULL; + llvm::Value *storeValue = NULL; + llvm::Value *mask = NULL; - // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets - llvm::ConstantInt *offsetScaleInt = - llvm::dyn_cast(offsetScale); - Assert(offsetScaleInt != NULL); - uint64_t scaleValue = offsetScaleInt->getZExtValue(); + if ((gatherInfo != NULL && gatherInfo->isFactored) || + (scatterInfo != NULL && scatterInfo->isFactored)) { + llvm::Value *varyingOffsets = callInst->getArgOperand(1); + llvm::Value *offsetScale = callInst->getArgOperand(2); + llvm::Value *constOffsets = callInst->getArgOperand(3); + if (scatterInfo) + storeValue = callInst->getArgOperand(4); + mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5); - std::vector scales; - for (int i = 0; i < g->target.vectorWidth; ++i) { - if (varyingOffsets->getType() == LLVMTypes::Int64VectorType) - scales.push_back(LLVMInt64(scaleValue)); - else - scales.push_back(LLVMInt32((int32_t)scaleValue)); + // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets + llvm::Constant *offsetScaleVec = + lGetOffsetScaleVec(offsetScale, varyingOffsets->getType()); + + llvm::Value *scaledVarying = + llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, + varyingOffsets, "scaled_varying", callInst); + fullOffsets = + llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, + constOffsets, "varying+const_offsets", + callInst); } - llvm::Constant *offsetScaleVec = llvm::ConstantVector::get(scales); + else { + if (scatterInfo) + storeValue = callInst->getArgOperand(3); + mask = callInst->getArgOperand((gatherInfo != NULL) ? 
3 : 4); - llvm::Value *scaledVarying = - llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, - varyingOffsets, "scaled_varying", callInst); - llvm::Value *fullOffsets = - llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, - constOffsets, "varying+const_offsets", - callInst); + llvm::Value *offsetScale = callInst->getArgOperand(1); + llvm::Value *offsets = callInst->getArgOperand(2); + llvm::Value *offsetScaleVec = + lGetOffsetScaleVec(offsetScale, offsets->getType()); + + fullOffsets = + llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, + offsets, "scaled_offsets", callInst); + } Debug(SourcePos(), "GSToLoadStore: %s.", fullOffsets->getName().str().c_str()); @@ -3631,7 +3870,6 @@ lReplacePseudoGS(llvm::CallInst *callInst) { : isGather(ig) { pseudoFunc = m->module->getFunction(pName); actualFunc = m->module->getFunction(aName); - Assert(pseudoFunc != NULL && actualFunc != NULL); } llvm::Function *pseudoFunc; llvm::Function *actualFunc; @@ -3639,20 +3877,6 @@ lReplacePseudoGS(llvm::CallInst *callInst) { }; LowerGSInfo lgsInfo[] = { - LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i8", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i16", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i32", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_float", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", "__gather_factored_base_offsets32_i64", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", "__gather_factored_base_offsets32_double", true), - - LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i8", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", "__gather_factored_base_offsets64_i16", true), - 
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i32", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_float", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", "__gather_factored_base_offsets64_i64", true), - LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", "__gather_factored_base_offsets64_double", true), - LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true), LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true), LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true), @@ -3667,19 +3891,57 @@ lReplacePseudoGS(llvm::CallInst *callInst) { LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true), LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i8", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", "__scatter_factored_base_offsets32_i16", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i32", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_float", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", "__scatter_factored_base_offsets32_i64", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", "__scatter_factored_base_offsets32_double", false), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", + "__gather_factored_base_offsets32_i8", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", + "__gather_factored_base_offsets32_i16", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", + "__gather_factored_base_offsets32_i32", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", + "__gather_factored_base_offsets32_float", true), + 
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", + "__gather_factored_base_offsets32_i64", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", + "__gather_factored_base_offsets32_double", true), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i8", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", "__scatter_factored_base_offsets64_i16", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i32", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_float", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", "__scatter_factored_base_offsets64_i64", false), - LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", "__scatter_factored_base_offsets64_double", false), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", + "__gather_factored_base_offsets64_i8", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", + "__gather_factored_base_offsets64_i16", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", + "__gather_factored_base_offsets64_i32", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", + "__gather_factored_base_offsets64_float", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", + "__gather_factored_base_offsets64_i64", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", + "__gather_factored_base_offsets64_double", true), + + LowerGSInfo("__pseudo_gather_base_offsets32_i8", + "__gather_base_offsets32_i8", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i16", + "__gather_base_offsets32_i16", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i32", + "__gather_base_offsets32_i32", true), + LowerGSInfo("__pseudo_gather_base_offsets32_float", + "__gather_base_offsets32_float", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i64", + 
"__gather_base_offsets32_i64", true), + LowerGSInfo("__pseudo_gather_base_offsets32_double", + "__gather_base_offsets32_double", true), + + LowerGSInfo("__pseudo_gather_base_offsets64_i8", + "__gather_base_offsets64_i8", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i16", + "__gather_base_offsets64_i16", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i32", + "__gather_base_offsets64_i32", true), + LowerGSInfo("__pseudo_gather_base_offsets64_float", + "__gather_base_offsets64_float", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i64", + "__gather_base_offsets64_i64", true), + LowerGSInfo("__pseudo_gather_base_offsets64_double", + "__gather_base_offsets64_double", true), LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false), LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false), @@ -3694,6 +3956,59 @@ lReplacePseudoGS(llvm::CallInst *callInst) { LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false), LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false), LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false), + + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", + "__scatter_factored_base_offsets32_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", + "__scatter_factored_base_offsets32_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", + "__scatter_factored_base_offsets32_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", + "__scatter_factored_base_offsets32_float", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", + "__scatter_factored_base_offsets32_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", + "__scatter_factored_base_offsets32_double", false), + + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", + "__scatter_factored_base_offsets64_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", + 
"__scatter_factored_base_offsets64_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", + "__scatter_factored_base_offsets64_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", + "__scatter_factored_base_offsets64_float", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", + "__scatter_factored_base_offsets64_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", + "__scatter_factored_base_offsets64_double", false), + + + LowerGSInfo("__pseudo_scatter_base_offsets32_i8", + "__scatter_base_offsets32_i8", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i16", + "__scatter_base_offsets32_i16", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i32", + "__scatter_base_offsets32_i32", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_float", + "__scatter_base_offsets32_float", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i64", + "__scatter_base_offsets32_i64", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_double", + "__scatter_base_offsets32_double", false), + + LowerGSInfo("__pseudo_scatter_base_offsets64_i8", + "__scatter_base_offsets64_i8", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i16", + "__scatter_base_offsets64_i16", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i32", + "__scatter_base_offsets64_i32", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_float", + "__scatter_base_offsets64_float", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i64", + "__scatter_base_offsets64_i64", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_double", + "__scatter_base_offsets64_double", false), }; llvm::Function *calledFunc = callInst->getCalledFunction(); @@ -3709,6 +4024,7 @@ lReplacePseudoGS(llvm::CallInst *callInst) { if (info == NULL) return false; + Assert(info->actualFunc != NULL); // Get the source position from the metadata attached to the call // instruction so that we can issue PerformanceWarning()s 
below. @@ -3905,6 +4221,12 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { "__gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i16", "__gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i64", "__gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_double", + "__gather_base_offsets32_i8", "__gather_base_offsets32_i16", + "__gather_base_offsets32_i32", "__gather_base_offsets32_i64", + "__gather_base_offsets32_float", "__gather_base_offsets32_double", + "__gather_base_offsets64_i8", "__gather_base_offsets64_i16", + "__gather_base_offsets64_i32", "__gather_base_offsets64_i64", + "__gather_base_offsets64_float", "__gather_base_offsets64_double", "__gather32_i8", "__gather32_i16", "__gather32_i32", "__gather32_i64", "__gather32_float", "__gather32_double", @@ -3932,6 +4254,12 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { "__scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i16", "__scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i64", "__scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_double", + "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16", + "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64", + "__scatter_base_offsets32_float", "__scatter_base_offsets32_double", + "__scatter_base_offsets64_i8", "__scatter_base_offsets64_i16", + "__scatter_base_offsets64_i32", "__scatter_base_offsets64_i64", + "__scatter_base_offsets64_float", "__scatter_base_offsets64_double", "__scatter_elt32_i8", "__scatter_elt32_i16", "__scatter_elt32_i32", "__scatter_elt32_i64", "__scatter_elt32_float", "__scatter_elt32_double", From c09c87873eca8d2d8dffc96486af41dccaf2f50c Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 11 Jul 2012 14:09:17 -0700 Subject: [PATCH 03/15] Whitespace / indentation fixes. 
--- builtins/util.m4 | 61 ++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/builtins/util.m4 b/builtins/util.m4 index 26466b1f..974c799c 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1949,41 +1949,41 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, call void @__usedouble( %pg64_d) %g32_8 = call @__gather32_i8( %v32, - %mask) + %mask) call void @__use8( %g32_8) %g32_16 = call @__gather32_i16( %v32, - %mask) + %mask) call void @__use16( %g32_16) %g32_32 = call @__gather32_i32( %v32, - %mask) + %mask) call void @__use32( %g32_32) %g32_f = call @__gather32_float( %v32, - %mask) + %mask) call void @__usefloat( %g32_f) %g32_64 = call @__gather32_i64( %v32, - %mask) + %mask) call void @__use64( %g32_64) %g32_d = call @__gather32_double( %v32, - %mask) + %mask) call void @__usedouble( %g32_d) %g64_8 = call @__gather64_i8( %v64, - %mask) + %mask) call void @__use8( %g64_8) %g64_16 = call @__gather64_i16( %v64, - %mask) + %mask) call void @__use16( %g64_16) %g64_32 = call @__gather64_i32( %v64, - %mask) + %mask) call void @__use32( %g64_32) %g64_f = call @__gather64_float( %v64, - %mask) + %mask) call void @__usefloat( %g64_f) %g64_64 = call @__gather64_i64( %v64, - %mask) + %mask) call void @__use64( %g64_64) %g64_d = call @__gather64_double( %v64, - %mask) + %mask) call void @__usedouble( %g64_d) ifelse(HAVE_GATHER, `1', @@ -2166,27 +2166,27 @@ ifelse(HAVE_GATHER, `1', %gbo64_8 = call @__gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, - %v64, %mask) + %v64, %mask) call void @__use8( %gbo64_8) %gbo64_16 = call @__gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, - %v64, %mask) + %v64, %mask) call void @__use16( %gbo64_16) %gbo64_32 = call @__gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, - %v64, %mask) + %v64, %mask) call void @__use32( %gbo64_32) %gbo64_f = call @__gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, - %v64, %mask) + %v64, %mask) call void 
@__usefloat( %gbo64_f) %gbo64_64 = call @__gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, - %v64, %mask) + %v64, %mask) call void @__use64( %gbo64_64) %gbo64_d = call @__gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, - %v64, %mask) + %v64, %mask) call void @__usedouble( %pgbo64_d) ') @@ -2303,35 +2303,36 @@ ifelse(HAVE_SCATTER, `1', %vd, %mask) call void @__scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, - %v8, %mask) + %v8, %mask) call void @__scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, - %v16, %mask) + %v16, %mask) call void @__scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, - %v32, %mask) + %v32, %mask) call void @__scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, - %vf, %mask) + %vf, %mask) call void @__scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, - %v64, %mask) + %v64, %mask) call void @__scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, - %vd, %mask) + %vd, %mask) call void @__scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, - %v8, %mask) + %v8, %mask) call void @__scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, - %v16, %mask) + %v16, %mask) call void @__scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, - %v32, %mask) + %v32, %mask) call void @__scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, - %vf, %mask) + %vf, %mask) call void @__scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, - %v64, %mask) + %v64, %mask) call void @__scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, - %vd, %mask) + %vd, %mask) ') ret void } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector ops From 216ac4b1a484653a56993dbb46ac233deb263134 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 11 Jul 2012 14:52:14 -0700 Subject: [PATCH 04/15] Stop factoring out constant offsets for gather/scatter if instr is 
available. For KNC (gather/scatter), it's not helpful to factor base+offsets gathers and scatters into base_ptr + {1/2/4/8} * varying_offsets + const_offsets. Now, if a HW instruction is available for gather/scatter, we just factor into base + {1/2/4/8} * offsets (if possible). Not only is this simpler, but it's also what we need to pass a value along to the scale by 2/4/8 available directly in those instructions. Finishes issue #325. --- builtins/target-generic-common.ll | 19 ++- examples/intrinsics/generic-16.h | 69 ++++---- examples/intrinsics/generic-32.h | 68 ++++---- examples/intrinsics/generic-64.h | 74 ++++----- examples/intrinsics/knc.h | 100 ++++-------- examples/intrinsics/sse4.h | 258 +++++++++++++----------------- ispc.cpp | 5 + 7 files changed, 257 insertions(+), 336 deletions(-) diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index c54dd948..77c7aabe 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -32,6 +32,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32"; define(`MASK',`i1') +define(`HAVE_GATHER',`1') +define(`HAVE_SCATTER',`1') + include(`util.m4') stdlib_core() @@ -334,19 +337,19 @@ define void @__masked_store_blend_double(* nocapture, ;; gather/scatter define(`gather_scatter', ` -declare @__gather_factored_base_offsets32_$1(i8 * nocapture, , - i32, , ) nounwind readonly -declare @__gather_factored_base_offsets64_$1(i8 * nocapture, , - i32, , ) nounwind readonly +declare @__gather_base_offsets32_$1(i8 * nocapture, i32, , + ) nounwind readonly +declare @__gather_base_offsets64_$1(i8 * nocapture, i32, , + ) nounwind readonly declare @__gather32_$1(, ) nounwind readonly declare @__gather64_$1(, ) nounwind readonly -declare void @__scatter_factored_base_offsets32_$1(i8* nocapture, , - i32, , , ) nounwind -declare void 
@__scatter_factored_base_offsets64_$1(i8* nocapture, , - i32, , , ) nounwind +declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, , + , ) nounwind +declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, , + , ) nounwind declare void @__scatter32_$1(, , ) nounwind declare void @__scatter64_$1(, , diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index c18e9fbe..42978701 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1306,34 +1306,32 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec16_i1 mask) { \ +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec16_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 16; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ } -GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_factored_base_offsets32_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, 
__gather_factored_base_offsets32_float) -GATHER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_factored_base_offsets64_float) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_factored_base_offsets32_double) -GATHER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_factored_base_offsets64_double) +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32) +GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float) +GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double) +GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \ @@ -1361,32 +1359,31 @@ GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double) // scatter -#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec16_i1 mask) { \ 
+#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec16_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 16; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ *ptr = val.v[i]; \ } \ } -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_factored_base_offsets32_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_factored_base_offsets32_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_factored_base_offsets64_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_factored_base_offsets32_double) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_factored_base_offsets64_double) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, 
__vec16_i64, __scatter_base_offsets64_i16) +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32) +SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float) +SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double) +SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index c1f89cd8..94946f4a 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1374,34 +1374,32 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec32_i1 mask) { \ +#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec32_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 32; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ } -GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_factored_base_offsets32_i8) 
-GATHER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_factored_base_offsets64_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_factored_base_offsets32_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_factored_base_offsets64_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_factored_base_offsets32_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_factored_base_offsets64_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_factored_base_offsets32_float) -GATHER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_factored_base_offsets64_float) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_factored_base_offsets32_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_factored_base_offsets64_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_factored_base_offsets32_double) -GATHER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_factored_base_offsets64_double) +GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __gather_base_offsets32_i8) +GATHER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __gather_base_offsets64_i8) +GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_base_offsets64_i16) +GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_base_offsets32_i32) +GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_base_offsets64_i32) +GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_base_offsets32_float) +GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_base_offsets64_float) +GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_base_offsets64_i64) +GATHER_BASE_OFFSETS(__vec32_d, 
double, __vec32_i32, __gather_base_offsets32_double) +GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec32_i1 mask) { \ @@ -1429,32 +1427,30 @@ GATHER_GENERAL(__vec32_d, double, __vec32_i64, __gather64_double) // scatter -#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec32_i1 mask) { \ +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, __vec32_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 32; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ *ptr = val.v[i]; \ } \ } -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_factored_base_offsets32_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_factored_base_offsets64_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_factored_base_offsets32_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_factored_base_offsets64_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_factored_base_offsets32_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_factored_base_offsets64_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_factored_base_offsets32_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_factored_base_offsets64_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_factored_base_offsets32_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_i64, int64_t, 
__vec32_i64, __scatter_factored_base_offsets64_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_factored_base_offsets32_double) -SCATTER_FACTORED_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_factored_base_offsets64_double) +SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec32_i8, int8_t, __vec32_i64, __scatter_base_offsets64_i8) +SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_base_offsets64_i16) +SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_base_offsets32_i32) +SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_base_offsets64_i32) +SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_base_offsets32_float) +SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_base_offsets64_float) +SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_base_offsets64_i64) +SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_base_offsets32_double) +SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec32_i1 mask) { \ diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 2a54446e..ff84fee3 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1507,40 +1507,38 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec64_d val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec64_i1 mask) { \ +#define 
GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec64_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 64; ++i) \ - if ((mask.v & (1ull << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + if ((mask.v & (1ull << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ } -GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_factored_base_offsets32_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_factored_base_offsets64_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_factored_base_offsets32_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_factored_base_offsets64_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_factored_base_offsets32_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_factored_base_offsets64_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_factored_base_offsets32_float) -GATHER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_factored_base_offsets64_float) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_factored_base_offsets32_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_factored_base_offsets64_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_factored_base_offsets32_double) -GATHER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_factored_base_offsets64_double) +GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __gather_base_offsets32_i8) +GATHER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __gather_base_offsets64_i8) +GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_base_offsets32_i16) +GATHER_BASE_OFFSETS(__vec64_i16, 
int16_t, __vec64_i64, __gather_base_offsets64_i16) +GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_base_offsets32_i32) +GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_base_offsets64_i32) +GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_base_offsets32_float) +GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_base_offsets64_float) +GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_base_offsets32_i64) +GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_base_offsets64_i64) +GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_base_offsets32_double) +GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_double) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \ VTYPE ret; \ for (int i = 0; i < 64; ++i) \ - if ((mask.v & (1ull << i)) != 0) { \ + if ((mask.v & (1ull << i)) != 0) { \ STYPE *ptr = (STYPE *)ptrs.v[i]; \ ret.v[i] = *ptr; \ } \ @@ -1562,32 +1560,30 @@ GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double) // scatter -#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec64_i1 mask) { \ +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, __vec64_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 64; ++i) \ - if ((mask.v & (1ull << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + if ((mask.v & (1ull << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ *ptr = val.v[i]; \ } \ } -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_factored_base_offsets32_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, 
__scatter_factored_base_offsets64_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_factored_base_offsets32_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_factored_base_offsets64_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_factored_base_offsets32_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_factored_base_offsets64_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_factored_base_offsets32_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_factored_base_offsets64_float) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_factored_base_offsets32_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_factored_base_offsets64_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_factored_base_offsets32_double) -SCATTER_FACTORED_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_factored_base_offsets64_double) +SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i32, __scatter_base_offsets32_i8) +SCATTER_BASE_OFFSETS(__vec64_i8, int8_t, __vec64_i64, __scatter_base_offsets64_i8) +SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_base_offsets32_i16) +SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_base_offsets64_i16) +SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_base_offsets32_i32) +SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_base_offsets64_i32) +SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_base_offsets32_float) +SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_base_offsets64_float) +SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_base_offsets32_i64) +SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_base_offsets64_i64) +SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, 
__scatter_base_offsets32_double) +SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_base_offsets64_double) #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec64_i1 mask) { \ diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index fb11db11..a0331afb 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1940,60 +1940,33 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) -/* -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec16_i1 mask) { \ - VTYPE ret; \ - int8_t *base = (int8_t *)b; \ - for (int i = 0; i < 16; ++i) \ - if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ - ret.v[i] = *ptr; \ - } \ - return ret; \ -} -*/ - static FORCEINLINE __vec16_i32 -__gather_factored_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset, - uint32_t scale, __vec16_i32 constOffset, - __vec16_i1 mask) { - __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); - __vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset); - __vec16_i32 tmp; - +__gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, + __vec16_i1 mask) { // Loop is generated by intrinsic __vec16_i32 ret = _mm512_mask_i32extgather_epi32(tmp, mask, offsets, base, - _MM_UPCONV_EPI32_NONE, 1, + _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); return ret; } static FORCEINLINE __vec16_f -__gather_factored_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset, - uint32_t scale, __vec16_i32 constOffset, +__gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - __vec16_i32 vscale = 
_mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); - __vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset); - __vec16_f tmp; - // Loop is generated by intrinsic - __vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base, - _MM_UPCONV_PS_NONE, 1, + __vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base, + _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); return ret; } -GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_factored_base_offsets32_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_factored_base_offsets64_i8) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_factored_base_offsets32_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_factored_base_offsets64_i16) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_factored_base_offsets64_i32) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_factored_base_offsets32_i64) -GATHER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_factored_base_offsets64_i64) +//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) /* @@ -2039,45 +2012,30 @@ static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask) */ // scatter -#define SCATTER_FACTORED_BASE_OFFSETS(VTYPE, STYPE, OTYPE, 
FUNC) -/* -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec16_i1 mask) { \ - int8_t *base = (int8_t *)b; \ - for (int i = 0; i < 16; ++i) \ - if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ - *ptr = val.v[i]; \ - } \ -} -*/ - -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_factored_base_offsets32_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_factored_base_offsets64_i8) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_factored_base_offsets32_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_factored_base_offsets64_i16) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_factored_base_offsets64_i32) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_factored_base_offsets32_i64) -SCATTER_FACTORED_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_factored_base_offsets64_i64) +//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) +//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) +//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) +//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) static FORCEINLINE void -__scatter_factored_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset, - uint32_t scale, __vec16_i32 constOffset, +__scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) { - __vec16_i32 
offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset); - _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE); + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } static FORCEINLINE void -__scatter_factored_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset, - uint32_t scale, const __vec16_i32 &constOffset, - const __vec16_f &val, const __vec16_i1 mask) +__scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_f val, __vec16_i1 mask) { - __vec16_i32 offsets = __add(__mul(varyingOffset,__vec16_i32(scale)), constOffset); - _mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE); + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); } #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 088b694d..17ab8f18 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2892,54 +2892,53 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec4_d val, template static FORCEINLINE RetVec -lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { +lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, uint32_t scale, + __vec4_i32 offsets, __vec4_i1 mask) { RetScalar r[4]; #if 1 // "Fast gather" trick... 
offsets = __select(mask, offsets, __setzero_i32()); - constOffset = __select(mask, constOffset, __setzero_i32()); - int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1); + offset = scale * _mm_extract_epi32(offsets.v, 1); ptr = (RetScalar *)(p + offset); r[1] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2); + offset = scale * _mm_extract_epi32(offsets.v, 2); ptr = (RetScalar *)(p + offset); r[2] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3); + offset = scale * _mm_extract_epi32(offsets.v, 3); ptr = (RetScalar *)(p + offset); r[3] = *ptr; #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1); + int offset = scale * _mm_extract_epi32(offsets.v, 1); RetScalar *ptr = (RetScalar *)(p + offset); r[1] = *ptr; } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2); + int offset = scale * _mm_extract_epi32(offsets.v, 2); RetScalar *ptr = (RetScalar *)(p + offset); r[2] = *ptr; } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3); + int offset = scale * _mm_extract_epi32(offsets.v, 3); RetScalar *ptr = (RetScalar *)(p + offset); r[3] = *ptr; } @@ -2950,54 +2949,53 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, 
__vec4_i32 offsets, template static FORCEINLINE RetVec -lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { +lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, uint32_t scale, + __vec4_i64 offsets, __vec4_i1 mask) { RetScalar r[4]; #if 1 // "Fast gather" trick... offsets = __select(mask, offsets, __setzero_i64()); - constOffset = __select(mask, constOffset, __setzero_i64()); - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1); + offset = scale * _mm_extract_epi64(offsets.v[0], 1); ptr = (RetScalar *)(p + offset); r[1] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0); + offset = scale * _mm_extract_epi64(offsets.v[1], 0); ptr = (RetScalar *)(p + offset); r[2] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1); + offset = scale * _mm_extract_epi64(offsets.v[1], 1); ptr = (RetScalar *)(p + offset); r[3] = *ptr; #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); RetScalar *ptr = (RetScalar *)(p + offset); r[1] = *ptr; } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0); + 
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[2] = *ptr; } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); RetScalar *ptr = (RetScalar *)(p + offset); r[3] = *ptr; } @@ -3007,87 +3005,75 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, } static FORCEINLINE __vec4_i8 -__gather_factored_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i8(unsigned char *b, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i8 -__gather_factored_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i8(unsigned char *b, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i16 -__gather_factored_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i16(unsigned char *b, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i16 - __gather_factored_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - 
return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i16(unsigned char *b, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i32 -__gather_factored_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale, - __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i32(uint8_t *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_i32 -__gather_factored_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i32(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_f -__gather_factored_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale, - __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_f(), float(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_float(uint8_t *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_f(), float(), p, scale, offsets, mask); } static FORCEINLINE __vec4_f -__gather_factored_base_offsets64_float(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_f(), float(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_float(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return 
lGatherBaseOffsets64(__vec4_f(), float(), p, scale, offsets, mask); } static FORCEINLINE __vec4_i64 -__gather_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_i64 -__gather_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_d -__gather_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_d(), double(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_d(), double(), p, scale, offsets, mask); } static FORCEINLINE __vec4_d -__gather_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_d(), double(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_d(), double(), p, scale, offsets, mask); } template @@ -3252,63 +3238,55 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) { #define 
SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \ static FORCEINLINE void \ -__scatter_factored_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \ - uint32_t scale, __vec4_i32 constOffset, \ +__scatter_base_offsets32_##SUFFIX (unsigned char *b, uint32_t scale, \ + __vec4_i32 offsets, \ __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \ uint32_t m = _mm_extract_ps(mask.v, 0); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \ - _mm_extract_epi32(constOffset.v, 0)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0)); \ *ptr = EXTRACT(val.v, 0); \ } \ m = _mm_extract_ps(mask.v, 1); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1) + \ - _mm_extract_epi32(constOffset.v, 1)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1)); \ *ptr = EXTRACT(val.v, 1); \ } \ m = _mm_extract_ps(mask.v, 2); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2) + \ - _mm_extract_epi32(constOffset.v, 2)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2)); \ *ptr = EXTRACT(val.v, 2); \ } \ m = _mm_extract_ps(mask.v, 3); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3) + \ - _mm_extract_epi32(constOffset.v, 3)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3)); \ *ptr = EXTRACT(val.v, 3); \ } \ } \ -static FORCEINLINE void \ -__scatter_factored_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \ - uint32_t scale, __vec4_i64 constOffset, \ +static FORCEINLINE void \ +__scatter_base_offsets64_##SUFFIX(unsigned char *p, uint32_t scale, \ + __vec4_i64 offsets, \ __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \ uint32_t m = _mm_extract_ps(mask.v, 0); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + \ - _mm_extract_epi64(constOffset.v[0], 0); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); \ TYPE *ptr = (TYPE *)(p + 
offset); \ *ptr = EXTRACT(val.v, 0); \ } \ m = _mm_extract_ps(mask.v, 1); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + \ - _mm_extract_epi64(constOffset.v[0], 1); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 1); \ } \ m = _mm_extract_ps(mask.v, 2); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + \ - _mm_extract_epi64(constOffset.v[1], 0); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 2); \ } \ m = _mm_extract_ps(mask.v, 3); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + \ - _mm_extract_epi64(constOffset.v[1], 1); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 3); \ } \ @@ -3322,91 +3300,79 @@ SCATTER32_64(float, f, float, _mm_extract_ps_as_float) static FORCEINLINE void -__scatter_factored_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val, - __vec4_i1 mask) { - uint32_t m = _mm_extract_ps(mask.v, 0); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) + - _mm_extract_epi32(constOffset.v, 0); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[0], 0); - } - - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) + - _mm_extract_epi32(constOffset.v, 1); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[0], 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) + - _mm_extract_epi32(constOffset.v, 2); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[1], 0); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int32_t offset = scale * 
_mm_extract_epi32(offsets.v, 3) + - _mm_extract_epi32(constOffset.v, 3); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[1], 1); - } -} - -static FORCEINLINE void -__scatter_factored_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, +__scatter_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets, __vec4_i64 val, __vec4_i1 mask) { uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + - _mm_extract_epi64(constOffset.v[0], 0); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 0); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 0); } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + - _mm_extract_epi64(constOffset.v[0], 1); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 1); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 1); } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + - _mm_extract_epi64(constOffset.v[1], 0); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 2); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 0); } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + - _mm_extract_epi64(constOffset.v[1], 1); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 3); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 1); } } static FORCEINLINE void -__scatter_factored_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_d val, - __vec4_i1 mask) { - __scatter_factored_base_offsets32_i64(p, offsets, scale, constOffset, val, mask); +__scatter_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i64 val, __vec4_i1 mask) { 
+ uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[0], 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[0], 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[1], 0); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[1], 1); + } } static FORCEINLINE void -__scatter_factored_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_d val, - __vec4_i1 mask) { - __scatter_factored_base_offsets64_i64(p, offsets, scale, constOffset, val, mask); +__scatter_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets, + __vec4_d val, __vec4_i1 mask) { + __scatter_base_offsets32_i64(p, scale, offsets, val, mask); +} + +static FORCEINLINE void +__scatter_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_d val, __vec4_i1 mask) { + __scatter_base_offsets64_i64(p, scale, offsets, val, mask); } diff --git a/ispc.cpp b/ispc.cpp index 0980c3d2..8fb8f0f5 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -254,6 +254,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-8")) { t->isa = Target::GENERIC; @@ -263,6 +264,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals 
= true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-16")) { t->isa = Target::GENERIC; @@ -272,6 +274,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-32")) { t->isa = Target::GENERIC; @@ -281,6 +284,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-64")) { t->isa = Target::GENERIC; @@ -290,6 +294,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-1")) { t->isa = Target::GENERIC; From df18b2a150ce72225733428bad9eb02d917c46f3 Mon Sep 17 00:00:00 2001 From: Jean-Luc Duprat Date: Wed, 11 Jul 2012 15:43:11 -0700 Subject: [PATCH 05/15] Fixed missing tmp var needed for use with gather intrinsic --- examples/intrinsics/knc.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index a0331afb..0cfb3d31 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1589,11 +1589,6 @@ CAST_BITS_SCALAR(double, int64_t) /////////////////////////////////////////////////////////////////////////// // various math functions -/* -static FORCEINLINE void __fastmath() { -} -*/ - static FORCEINLINE float __round_uniform_float(float v) { return roundf(v); } @@ -1943,7 +1938,7 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - // Loop is generated by intrinsic + __vec16_i32 tmp = _mm512_undefined_epi32(); 
__vec16_i32 ret = _mm512_mask_i32extgather_epi32(tmp, mask, offsets, base, _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); @@ -1953,7 +1948,7 @@ __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - // Loop is generated by intrinsic + __vec16_f tmp = _mm512_undefined_ps(); __vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base, _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); From 2bacebb1fb792fa04fd99e65033e24453527f62f Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 11 Jul 2012 19:51:28 -0700 Subject: [PATCH 06/15] Doc fixes (Crystal Lemire). --- docs/ispc.rst | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index 7e671dbd..f1f959c9 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -2084,7 +2084,7 @@ can be declared: soa<8> Point pts[...]; -The in-memory layout of the ``Point``s has had the SOA transformation +The in-memory layout of the ``Point`` instances has had the SOA transformation applied, such that there are 8 ``x`` values in memory followed by 8 ``y`` values, and so forth. Here is the effective declaration of ``soa<8> Point``: @@ -2266,7 +2266,7 @@ based on C++'s ``new`` and ``delete`` operators: // use ptr... delete[] ptr; -In the above code, each program instance allocates its own ``count`-sized +In the above code, each program instance allocates its own ``count`` sized array of ``uniform int`` values, uses that memory, and then deallocates that memory. Uses of ``new`` and ``delete`` in ``ispc`` programs are serviced by corresponding calls the system C library's ``malloc()`` and @@ -2277,9 +2277,7 @@ analogous to the corresponding rules are for pointers (as described in `Pointer Types`_.) 
Specifically, if a specific rate qualifier isn't provided with the ``new`` expression, then the default is that a "varying" ``new`` is performed, where each program instance performs a unique -allocation. The allocated type, in turn, is by default ``uniform`` for -``varying`` ``new`` expressions, and ``varying`` for ``uniform`` new -expressions. +allocation. The allocated type, in turn, is by default ``uniform``. After a pointer has been deleted, it is illegal to access the memory it points to. However, that deletion happens on a per-program-instance basis. @@ -3491,7 +3489,7 @@ generates the following output on a four-wide compilation target: :: i = 10, x = [0.000000,1.000000,2.000000,3.000000] - added to x = [1.000000,2.000000,((2.000000)),((3.000000)] + added to x = [1.000000,2.000000,((2.000000)),((3.000000))] last print of x = [1.000000,2.000000,2.000000,3.000000] When a varying variable is printed, the values for program instances that @@ -4010,8 +4008,8 @@ Systems Programming Support Atomic Operations and Memory Fences ----------------------------------- -The standard range of atomic memory operations are provided by the standard -library``ispc``, including variants to handle both uniform and varying +The standard set of atomic memory operations are provided by the standard +library, including variants to handle both uniform and varying types as well as "local" and "global" atomics. Local atomics provide atomic behavior across the program instances in a From 2c640f7e522c87808d920320221ee331cf54ad71 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 12 Jul 2012 06:07:07 -0700 Subject: [PATCH 07/15] Add support for RDRAND in IvyBridge. The standard library now provides a variety of rdrand() functions that call out to RDRAND, when available. Issue #263. 
--- builtins.cpp | 3 + builtins/target-avx1-x2.ll | 2 + builtins/target-avx1.ll | 2 + builtins/target-avx11-x2.ll | 41 ++++++- builtins/target-avx11.ll | 41 ++++++- builtins/target-avx2-x2.ll | 4 +- builtins/target-avx2.ll | 4 +- builtins/target-generic-common.ll | 1 + builtins/target-sse2-common.ll | 1 + builtins/target-sse4-common.ll | 1 + builtins/util.m4 | 45 ++++++++ docs/ispc.rst | 35 ++++++ stdlib.ispc | 185 ++++++++++++++++++++++++++++++ tests/rdrand-1.ispc | 21 ++++ tests/rdrand-2.ispc | 19 +++ tests/rdrand-3.ispc | 25 ++++ tests/rdrand-4.ispc | 33 ++++++ tests/rdrand-5.ispc | 33 ++++++ tests/rdrand-6.ispc | 35 ++++++ 19 files changed, 525 insertions(+), 6 deletions(-) create mode 100644 tests/rdrand-1.ispc create mode 100644 tests/rdrand-2.ispc create mode 100644 tests/rdrand-3.ispc create mode 100644 tests/rdrand-4.ispc create mode 100644 tests/rdrand-5.ispc create mode 100644 tests/rdrand-6.ispc diff --git a/builtins.cpp b/builtins.cpp index 00f72fc8..64f06e1f 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -476,6 +476,9 @@ lSetInternalFunctions(llvm::Module *module) { "__prefetch_read_uniform_nt", "__rcp_uniform_float", "__rcp_varying_float", + "__rdrand_i16", + "__rdrand_i32", + "__rdrand_i64", "__reduce_add_double", "__reduce_add_float", "__reduce_add_int32", diff --git a/builtins/target-avx1-x2.ll b/builtins/target-avx1-x2.ll index efde5d10..e06134d9 100644 --- a/builtins/target-avx1-x2.ll +++ b/builtins/target-avx1-x2.ll @@ -31,6 +31,8 @@ include(`target-avx-x2.ll') +rdrand_decls() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx1.ll b/builtins/target-avx1.ll index 64f8ad33..1b47955a 100644 --- a/builtins/target-avx1.ll +++ b/builtins/target-avx1.ll @@ -31,6 +31,8 @@ include(`target-avx.ll') +rdrand_decls() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx11-x2.ll b/builtins/target-avx11-x2.ll 
index 884255df..cdb83726 100644 --- a/builtins/target-avx11-x2.ll +++ b/builtins/target-avx11-x2.ll @@ -29,9 +29,46 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -define(`NO_HALF_DECLARES', `1') +include(`target-avx-x2.ll') -include(`target-avx1-x2.ll') +rdrand_definition() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <16 x i32> %ret +} + +define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <16 x i32> %ret +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <16 x i32> %ret +} + +define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <16 x i32> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather(i8) +gen_gather(i16) +gen_gather(i32) +gen_gather(float) +gen_gather(i64) +gen_gather(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index 35aebe91..d3ab9f13 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -29,9 +29,46 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-define(`NO_HALF_DECLARES', `1') +include(`target-avx.ll') -include(`target-avx1.ll') +rdrand_definition() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <8 x i32> %ret +} + +define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <8 x i32> %ret +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <8 x i32> %ret +} + +define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <8 x i32> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather(i8) +gen_gather(i16) +gen_gather(i32) +gen_gather(float) +gen_gather(i64) +gen_gather(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index 1ca3443c..1d2a2093 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. 
;; ;; Redistribution and use in source and binary forms, with or without @@ -31,6 +31,8 @@ include(`target-avx-x2.ll') +rdrand_definition() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index 7152657e..45496779 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -31,6 +31,8 @@ include(`target-avx.ll') +rdrand_definition() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 77c7aabe..7b4cfd9c 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -40,6 +40,7 @@ include(`util.m4') stdlib_core() scans() reduce_equal(WIDTH) +rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; broadcast/rotate/shuffle diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index e0b7f40c..c6a3afe2 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -33,6 +33,7 @@ ctlztz() define_prefetches() define_shuffles() aossoa() +rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 69461fcd..4b8751b5 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -33,6 +33,7 @@ ctlztz() define_prefetches() define_shuffles() aossoa() +rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/util.m4 b/builtins/util.m4 index 974c799c..614ac998 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ 
-3712,3 +3712,48 @@ define void @__scatter64_$1( %ptrs, %values, ' ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rdrand + +define(`rdrand_decls', ` +declare i1 @__rdrand_i16(i16 * nocapture) +declare i1 @__rdrand_i32(i32 * nocapture) +declare i1 @__rdrand_i64(i64 * nocapture) +') + +define(`rdrand_definition', ` +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rdrand + +declare {i16, i32} @llvm.x86.rdrand.16() +declare {i32, i32} @llvm.x86.rdrand.32() +declare {i64, i32} @llvm.x86.rdrand.64() + +define i1 @__rdrand_i16(i16 * %ptr) { + %v = call {i16, i32} @llvm.x86.rdrand.16() + %v0 = extractvalue {i16, i32} %v, 0 + %v1 = extractvalue {i16, i32} %v, 1 + store i16 %v0, i16 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} + +define i1 @__rdrand_i32(i32 * %ptr) { + %v = call {i32, i32} @llvm.x86.rdrand.32() + %v0 = extractvalue {i32, i32} %v, 0 + %v1 = extractvalue {i32, i32} %v, 1 + store i32 %v0, i32 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} + +define i1 @__rdrand_i64(i64 * %ptr) { + %v = call {i64, i32} @llvm.x86.rdrand.64() + %v0 = extractvalue {i64, i32} %v, 0 + %v1 = extractvalue {i64, i32} %v, 1 + store i64 %v0, i64 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} +') diff --git a/docs/ispc.rst b/docs/ispc.rst index f1f959c9..98250e39 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -140,6 +140,7 @@ Contents: * `Basic Math Functions`_ * `Transcendental Functions`_ * `Pseudo-Random Numbers`_ + * `Random Numbers`_ + `Output Functions`_ + `Assertions`_ @@ -3455,6 +3456,40 @@ be used to get a pseudo-random ``float`` value. uniform unsigned int32 random(RNGState * uniform state) uniform float frandom(uniform RNGState * uniform state) + +Random Numbers +-------------- + +Some recent CPUs (including those based on the Intel(r) Ivy Bridge +micro-architecture), provide support for generating true random numbers. 
A +few standard library functions make this functionality available: + +:: + + bool rdrand(uniform int32 * uniform ptr) + bool rdrand(varying int32 * uniform ptr) + bool rdrand(uniform int32 * varying ptr) + +If the processor doesn't have sufficient entropy to generate a random +number, then this function fails and returns ``false``. Otherwise, if the +processor is successful, the random value is stored in the given pointer +and ``true`` is returned. Therefore, this function should generally be +used as follows, called repeatedly until it is successful: + +:: + + int r; + while (rdrand(&r) == false) + ; // empty loop body + + +In addition to the ``int32`` variants of ``rdrand()`` listed above, there +are versions that return ``int16``, ``float``, and ``int64`` values as +well. + +Note that when compiling to targets other than ``avx1.1`` and ``avx2``, the +``rdrand()`` functions always return ``false``. + Output Functions ---------------- diff --git a/stdlib.ispc b/stdlib.ispc index a7499930..3774c4a4 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4068,3 +4068,188 @@ static inline void seed_rng(uniform RNGState * uniform state, static inline void fastmath() { __fastmath(); } + +/////////////////////////////////////////////////////////////////////////// +// rdrand + +static inline uniform bool rdrand(float * uniform ptr) { + if (__have_native_rand == false) + return false; + else { + uniform int32 irand; + uniform bool success = __rdrand_i32(&irand); + if (success) { + irand &= (1<<23)-1; + *ptr = floatbits(0x3F800000 | irand)-1.0f; + } + return success; + } +} + +static inline bool rdrand(varying float * uniform ptr) { + if (__have_native_rand == false) + return false; + else { + bool success = false; + foreach_active (index) { + uniform int32 irand; + if (__rdrand_i32(&irand)) { + // FIXME: it probably would be preferable, here and in the + // following rdrand() function, to do the int->float stuff + // in vector form. 
However, we need to be careful to not + // clobber any existing already-set values in *ptr with + // inactive lanes here... + irand &= (1<<23)-1; + *ptr = floatbits(0x3F800000 | irand)-1.0f; + success = true; + } + } + return success; + } +} + +static inline bool rdrand(float * ptr) { + if (__have_native_rand == false) + return false; + else { + float * uniform ptrs[programCount]; + ptrs[programIndex] = ptr; + + bool success = false; + foreach_active (index) { + uniform int32 irand; + if (__rdrand_i32(&irand)) { + irand &= (1<<23)-1; + *ptrs[index] = floatbits(0x3F800000 | irand)-1.0f; + success = true; + } + } + return success; + } +} + +static inline uniform bool rdrand(int16 * uniform ptr) { + if (__have_native_rand == false) + return false; + else + return __rdrand_i16(ptr); +} + +static inline bool rdrand(varying int16 * uniform ptr) { + if (__have_native_rand == false) + return false; + else { + bool success = false; + foreach_active (index) { + uniform int16 irand; + if (__rdrand_i16(&irand)) { + *ptr = irand; + success = true; + } + } + return success; + } +} + +static inline bool rdrand(int16 * ptr) { + if (__have_native_rand == false) + return false; + else { + int16 * uniform ptrs[programCount]; + ptrs[programIndex] = ptr; + bool success = false; + + foreach_active (index) { + uniform int16 irand; + if (__rdrand_i16(&irand)) { + *ptrs[index] = irand; + success = true; + } + } + return success; + } +} + +static inline uniform bool rdrand(int32 * uniform ptr) { + if (__have_native_rand == false) + return false; + else + return __rdrand_i32(ptr); +} + +static inline bool rdrand(varying int32 * uniform ptr) { + if (__have_native_rand == false) + return false; + else { + bool success = false; + foreach_active (index) { + uniform int32 irand; + if (__rdrand_i32(&irand)) { + *ptr = irand; + success = true; + } + } + return success; + } +} + +static inline bool rdrand(int32 * ptr) { + if (__have_native_rand == false) + return false; + else { + int32 * uniform 
ptrs[programCount]; + ptrs[programIndex] = ptr; + bool success = false; + + foreach_active (index) { + uniform int32 irand; + if (__rdrand_i32(&irand)) { + *ptrs[index] = irand; + success = true; + } + } + return success; + } +} + +static inline uniform bool rdrand(int64 * uniform ptr) { + if (__have_native_rand == false) + return false; + else + return __rdrand_i64(ptr); +} + +static inline bool rdrand(varying int64 * uniform ptr) { + if (__have_native_rand == false) + return false; + else { + bool success = false; + foreach_active (index) { + uniform int64 irand; + if (__rdrand_i64(&irand)) { + *ptr = irand; + success = true; + } + } + return success; + } +} + +static inline bool rdrand(int64 * ptr) { + if (__have_native_rand == false) + return false; + else { + int64 * uniform ptrs[programCount]; + ptrs[programIndex] = ptr; + bool success = false; + + foreach_active (index) { + uniform int64 irand; + if (__rdrand_i64(&irand)) { + *ptrs[index] = irand; + success = true; + } + } + return success; + } +} diff --git a/tests/rdrand-1.ispc b/tests/rdrand-1.ispc new file mode 100644 index 00000000..53ca6121 --- /dev/null +++ b/tests/rdrand-1.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { +#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2) + RET[programIndex] = 1; +#else + + uniform float r = -1; + uniform int count = 0; + while (!rdrand(&r)) { + ++count; + } + RET[programIndex] = (r >= 0 && r < 1); + +#endif +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/rdrand-2.ispc b/tests/rdrand-2.ispc new file mode 100644 index 00000000..7021a271 --- /dev/null +++ b/tests/rdrand-2.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { +#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2) + RET[programIndex] = 1; +#else + + float r = -1; 
+ while (!rdrand(&r)) + ; + RET[programIndex] = (r >= 0 && r < 1); + +#endif +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/rdrand-3.ispc b/tests/rdrand-3.ispc new file mode 100644 index 00000000..a9fc93a3 --- /dev/null +++ b/tests/rdrand-3.ispc @@ -0,0 +1,25 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { +#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2) + RET[programIndex] = 1; +#else + + int lessHalf = 0, moreHalf = 0; + for (uniform int i = 0; i < 1024*1024; ++i) { + float r = -1; + while (!rdrand(&r)) + ; + if (r < 0.5) ++lessHalf; + else ++moreHalf; + } + + float r = (double)lessHalf / (double)(lessHalf + moreHalf); + RET[programIndex] = (r >= .49 && r < .51); +#endif +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1; +} diff --git a/tests/rdrand-4.ispc b/tests/rdrand-4.ispc new file mode 100644 index 00000000..3b38b7b1 --- /dev/null +++ b/tests/rdrand-4.ispc @@ -0,0 +1,33 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { +#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2) + RET[programIndex] = 0; +#else + + uniform int set[64] = { 0 }; + uniform int count = 1024*1024; + for (uniform int i = 0; i < count; ++i) { + uniform int64 r; + while (!rdrand(&r)) + ; + for (uniform int b = 0; b < 64; ++b) + if (((unsigned int64)r >> b) & 1) + ++set[b]; + } + + RET[programIndex] = 0; + for (uniform int b = 0; b < 64; ++b) { + float r = (double)set[b] / (double)(count); + if (!(r >= .49 && r < .51)) { + print("% % - %\n", b, r, set[b]); + ++RET[programIndex]; + } + } +#endif +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/rdrand-5.ispc b/tests/rdrand-5.ispc new file mode 100644 index 00000000..cbf59a97 --- /dev/null +++ b/tests/rdrand-5.ispc @@ -0,0 +1,33 @@ + +export uniform int width() { 
return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { +#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2) + RET[programIndex] = 0; +#else + + int set[32] = { 0 }; + uniform int count = 1024*1024; + for (uniform int i = 0; i < count; ++i) { + int32 r; + while (!rdrand(&r)) + ; + for (uniform int b = 0; b < 32; ++b) + if (((unsigned int32)r >> b) & 1) + ++set[b]; + } + + RET[programIndex] = 0; + for (uniform int b = 0; b < 32; ++b) { + float r = (double)set[b] / (double)(count); + if (!(r >= .49 && r < .51)) { + print("% % - %\n", b, r, set[b]); + ++RET[programIndex]; + } + } +#endif +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/rdrand-6.ispc b/tests/rdrand-6.ispc new file mode 100644 index 00000000..93137625 --- /dev/null +++ b/tests/rdrand-6.ispc @@ -0,0 +1,35 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { +#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2) + RET[programIndex] = 0; +#else + + int set[32] = { 0 }; + uniform int count = 1024*1024; + for (uniform int i = 0; i < count; ++i) { + uniform int32 rr[programCount]; + int * ptr = rr + programIndex; + while (!rdrand(ptr)) + ; + int32 r = rr[programIndex]; + for (uniform int b = 0; b < 32; ++b) + if (((unsigned int32)r >> b) & 1) + ++set[b]; + } + + RET[programIndex] = 0; + for (uniform int b = 0; b < 32; ++b) { + float r = (double)set[b] / (double)(count); + if (!(r >= .49 && r < .51)) { + print("% % - %\n", b, r, set[b]); + ++RET[programIndex]; + } + } +#endif +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} From e09e953bbbc55809fc4358f7194b3f85d52192fc Mon Sep 17 00:00:00 2001 From: Jean-Luc Duprat Date: Thu, 12 Jul 2012 10:32:38 -0700 Subject: [PATCH 08/15] Added a few functions: __setzero_i64() __cast_sext(__vec16_i64, __vec16_i32), __cast_zext(__vec16_i32) __min_varying_in32(), __min_varying_uint32(), 
__max_varying_int32(), __max_varying_uint32() Fixed the signature of __smear_i64() to match current codegen --- examples/intrinsics/knc.h | 59 +++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 0cfb3d31..404cd24f 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -803,6 +803,13 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { // int64 +static FORCEINLINE __vec16_i64 __setzero_i64() { + __vec16_i64 ret; + ret.v_lo = _mm512_setzero_epi32(); + ret.v_hi = _mm512_setzero_epi32(); + return ret; +} + static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b) { __mmask16 carry = 0; @@ -878,7 +885,7 @@ static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index) return src[index+16] | (int64_t(src[index]) << 32); } -static FORCEINLINE __vec16_i64 __smear_i64(__vec16_i64, const int64_t &l) { +static FORCEINLINE __vec16_i64 __smear_i64(const int64_t &l) { const int *i = (const int*)&l; return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1])); } @@ -1373,6 +1380,11 @@ CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext) CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext) +static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31)); +} + #define CAST_SEXT_I1(TYPE) /* static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ @@ -1389,11 +1401,6 @@ CAST_SEXT_I1(__vec16_i8) CAST_SEXT_I1(__vec16_i16) CAST_SEXT_I1(__vec16_i32) -static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) -{ - return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31)); -} - // zero extension CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) CAST(__vec16_i64, uint64_t, 
__vec16_i16, uint16_t, __cast_zext) @@ -1421,6 +1428,14 @@ CAST_ZEXT_I1(__vec16_i16) CAST_ZEXT_I1(__vec16_i32) CAST_ZEXT_I1(__vec16_i64) +static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val.m, one); +} + + // truncations CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc) CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc) @@ -1654,14 +1669,25 @@ static FORCEINLINE __vec16_f __min_varying_float(__vec16_f v1, __vec16_f v2) { return _mm512_gmin_ps(v1, v2); } +static FORCEINLINE __vec16_i32 __max_varying_int32(__vec16_i32 v1, __vec16_i32 v2) { + return _mm512_max_epi32(v1, v2); +} + +static FORCEINLINE __vec16_i32 __min_varying_int32(__vec16_i32 v1, __vec16_i32 v2) { + return _mm512_min_epi32(v1, v2); +} + +static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { + return _mm512_max_epu32(v1, v2); +} + +static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { + return _mm512_min_epu32(v1, v2); +} + BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double) BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double) -BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32) -BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32) -BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32) -BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32) - BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64) BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64) BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64) @@ -2033,6 +2059,17 @@ __scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, _MM_HINT_NONE); } +/* +static FORCEINLINE void +__scatter_base_offsets64_float(void *base, const 
__vec16_i64 &varyingOffset, + uint32_t scale, const __vec16_i64 &constOffset, + const __vec16_f &val, const __vec16_i1 mask) +{ + __vec16_i64 offsets = __add(__mul(varyingOffset,__vec16_i64(scale)), constOffset); + _mm512_mask_i64extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE); +} +*/ + #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) /* static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ From d180031ef0014611f17692935da36bdd75ad4315 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 12 Jul 2012 13:56:58 -0700 Subject: [PATCH 09/15] Add more tests of basic gather functionality. --- tests/gather-double-1.ispc | 17 +++++++++++++++++ tests/gather-double-2.ispc | 17 +++++++++++++++++ tests/gather-double-3.ispc | 21 +++++++++++++++++++++ tests/gather-double-4.ispc | 21 +++++++++++++++++++++ tests/gather-double-5.ispc | 19 +++++++++++++++++++ tests/gather-double-6.ispc | 19 +++++++++++++++++++ tests/gather-double-7.ispc | 23 +++++++++++++++++++++++ tests/gather-double-8.ispc | 23 +++++++++++++++++++++++ tests/gather-float-1.ispc | 17 +++++++++++++++++ tests/gather-float-2.ispc | 17 +++++++++++++++++ tests/gather-float-3.ispc | 21 +++++++++++++++++++++ tests/gather-float-4.ispc | 21 +++++++++++++++++++++ tests/gather-float-5.ispc | 19 +++++++++++++++++++ tests/gather-float-6.ispc | 19 +++++++++++++++++++ tests/gather-float-7.ispc | 23 +++++++++++++++++++++++ tests/gather-float-8.ispc | 23 +++++++++++++++++++++++ tests/gather-int16-1.ispc | 24 +++++++++++------------- tests/gather-int16-2.ispc | 17 +++++++++++++++++ tests/gather-int16-3.ispc | 21 +++++++++++++++++++++ tests/gather-int16-4.ispc | 21 +++++++++++++++++++++ tests/gather-int16-5.ispc | 19 +++++++++++++++++++ tests/gather-int16-6.ispc | 19 +++++++++++++++++++ tests/gather-int16-7.ispc | 23 +++++++++++++++++++++++ tests/gather-int16-8.ispc | 23 +++++++++++++++++++++++ tests/gather-int32-1.ispc | 17 +++++++++++++++++ tests/gather-int32-2.ispc | 
17 +++++++++++++++++ tests/gather-int32-3.ispc | 21 +++++++++++++++++++++ tests/gather-int32-4.ispc | 21 +++++++++++++++++++++ tests/gather-int32-5.ispc | 19 +++++++++++++++++++ tests/gather-int32-6.ispc | 19 +++++++++++++++++++ tests/gather-int32-7.ispc | 23 +++++++++++++++++++++++ tests/gather-int32-8.ispc | 23 +++++++++++++++++++++++ tests/gather-int64-1.ispc | 17 +++++++++++++++++ tests/gather-int64-2.ispc | 17 +++++++++++++++++ tests/gather-int64-3.ispc | 21 +++++++++++++++++++++ tests/gather-int64-4.ispc | 21 +++++++++++++++++++++ tests/gather-int64-5.ispc | 19 +++++++++++++++++++ tests/gather-int64-6.ispc | 19 +++++++++++++++++++ tests/gather-int64-7.ispc | 23 +++++++++++++++++++++++ tests/gather-int64-8.ispc | 23 +++++++++++++++++++++++ tests/gather-int8-1.ispc | 24 +++++++++++------------- tests/gather-int8-2.ispc | 17 +++++++++++++++++ tests/gather-int8-3.ispc | 21 +++++++++++++++++++++ tests/gather-int8-4.ispc | 21 +++++++++++++++++++++ tests/gather-int8-5.ispc | 19 +++++++++++++++++++ tests/gather-int8-6.ispc | 19 +++++++++++++++++++ tests/gather-int8-7.ispc | 23 +++++++++++++++++++++++ tests/gather-int8-8.ispc | 23 +++++++++++++++++++++++ 48 files changed, 948 insertions(+), 26 deletions(-) create mode 100644 tests/gather-double-1.ispc create mode 100644 tests/gather-double-2.ispc create mode 100644 tests/gather-double-3.ispc create mode 100644 tests/gather-double-4.ispc create mode 100644 tests/gather-double-5.ispc create mode 100644 tests/gather-double-6.ispc create mode 100644 tests/gather-double-7.ispc create mode 100644 tests/gather-double-8.ispc create mode 100644 tests/gather-float-1.ispc create mode 100644 tests/gather-float-2.ispc create mode 100644 tests/gather-float-3.ispc create mode 100644 tests/gather-float-4.ispc create mode 100644 tests/gather-float-5.ispc create mode 100644 tests/gather-float-6.ispc create mode 100644 tests/gather-float-7.ispc create mode 100644 tests/gather-float-8.ispc create mode 100644 tests/gather-int16-2.ispc 
create mode 100644 tests/gather-int16-3.ispc create mode 100644 tests/gather-int16-4.ispc create mode 100644 tests/gather-int16-5.ispc create mode 100644 tests/gather-int16-6.ispc create mode 100644 tests/gather-int16-7.ispc create mode 100644 tests/gather-int16-8.ispc create mode 100644 tests/gather-int32-1.ispc create mode 100644 tests/gather-int32-2.ispc create mode 100644 tests/gather-int32-3.ispc create mode 100644 tests/gather-int32-4.ispc create mode 100644 tests/gather-int32-5.ispc create mode 100644 tests/gather-int32-6.ispc create mode 100644 tests/gather-int32-7.ispc create mode 100644 tests/gather-int32-8.ispc create mode 100644 tests/gather-int64-1.ispc create mode 100644 tests/gather-int64-2.ispc create mode 100644 tests/gather-int64-3.ispc create mode 100644 tests/gather-int64-4.ispc create mode 100644 tests/gather-int64-5.ispc create mode 100644 tests/gather-int64-6.ispc create mode 100644 tests/gather-int64-7.ispc create mode 100644 tests/gather-int64-8.ispc create mode 100644 tests/gather-int8-2.ispc create mode 100644 tests/gather-int8-3.ispc create mode 100644 tests/gather-int8-4.ispc create mode 100644 tests/gather-int8-5.ispc create mode 100644 tests/gather-int8-6.ispc create mode 100644 tests/gather-int8-7.ispc create mode 100644 tests/gather-int8-8.ispc diff --git a/tests/gather-double-1.ispc b/tests/gather-double-1.ispc new file mode 100644 index 00000000..64575545 --- /dev/null +++ b/tests/gather-double-1.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-double-2.ispc b/tests/gather-double-2.ispc new file mode 100644 index 00000000..78b9423a --- /dev/null +++ 
b/tests/gather-double-2.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-double-3.ispc b/tests/gather-double-3.ispc new file mode 100644 index 00000000..cfa32f21 --- /dev/null +++ b/tests/gather-double-3.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-double-4.ispc b/tests/gather-double-4.ispc new file mode 100644 index 00000000..d7ad2f5e --- /dev/null +++ b/tests/gather-double-4.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-double-5.ispc b/tests/gather-double-5.ispc new file mode 100644 index 00000000..3b97816a --- /dev/null +++ b/tests/gather-double-5.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform 
double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-double-6.ispc b/tests/gather-double-6.ispc new file mode 100644 index 00000000..1c464bd5 --- /dev/null +++ b/tests/gather-double-6.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-double-7.ispc b/tests/gather-double-7.ispc new file mode 100644 index 00000000..c73f3b4e --- /dev/null +++ b/tests/gather-double-7.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + double *ptr = (aFOO[0] == 1234) ? 
(double * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-double-8.ispc b/tests/gather-double-8.ispc new file mode 100644 index 00000000..52da874d --- /dev/null +++ b/tests/gather-double-8.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform double a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-float-1.ispc b/tests/gather-float-1.ispc new file mode 100644 index 00000000..18b3fd98 --- /dev/null +++ b/tests/gather-float-1.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-float-2.ispc b/tests/gather-float-2.ispc new file mode 100644 index 00000000..4f680814 --- /dev/null +++ b/tests/gather-float-2.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + 
RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-float-3.ispc b/tests/gather-float-3.ispc new file mode 100644 index 00000000..9e81cd06 --- /dev/null +++ b/tests/gather-float-3.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-float-4.ispc b/tests/gather-float-4.ispc new file mode 100644 index 00000000..4f114fee --- /dev/null +++ b/tests/gather-float-4.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-float-5.ispc b/tests/gather-float-5.ispc new file mode 100644 index 00000000..16f0e81e --- /dev/null +++ b/tests/gather-float-5.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + float *ptr = (aFOO[0] == 1234) ? 
(float * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-float-6.ispc b/tests/gather-float-6.ispc new file mode 100644 index 00000000..d1136f9a --- /dev/null +++ b/tests/gather-float-6.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-float-7.ispc b/tests/gather-float-7.ispc new file mode 100644 index 00000000..f5b09dc4 --- /dev/null +++ b/tests/gather-float-7.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-float-8.ispc b/tests/gather-float-8.ispc new file mode 100644 index 00000000..3708f063 --- /dev/null +++ b/tests/gather-float-8.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform float a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + float *ptr = (aFOO[0] == 1234) ? 
(float * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int16-1.ispc b/tests/gather-int16-1.ispc index e6bedd7f..89675185 100644 --- a/tests/gather-int16-1.ispc +++ b/tests/gather-int16-1.ispc @@ -1,19 +1,17 @@ + export uniform int width() { return programCount; } -export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int16 x[programCount]; - x[programIndex] = programIndex; - int a = aFOO[programIndex]-1; - unsigned int16 v; - if (programIndex < 2) - v = x[a]; - else - v = 2; - RET[programIndex] = v; +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; } export void result(uniform float RET[]) { - RET[programIndex] = 2; - RET[0] = 0; - RET[1] = 1; + RET[programIndex] = 1 + programIndex; } diff --git a/tests/gather-int16-2.ispc b/tests/gather-int16-2.ispc new file mode 100644 index 00000000..74fdab8c --- /dev/null +++ b/tests/gather-int16-2.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int16-3.ispc b/tests/gather-int16-3.ispc new file mode 100644 index 00000000..a197f754 --- /dev/null +++ b/tests/gather-int16-3.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 
a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int16-4.ispc b/tests/gather-int16-4.ispc new file mode 100644 index 00000000..db9a7217 --- /dev/null +++ b/tests/gather-int16-4.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int16-5.ispc b/tests/gather-int16-5.ispc new file mode 100644 index 00000000..8d6ced77 --- /dev/null +++ b/tests/gather-int16-5.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int16-6.ispc b/tests/gather-int16-6.ispc new file mode 100644 index 00000000..8d740856 --- /dev/null +++ b/tests/gather-int16-6.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int16 *ptr = (aFOO[0] == 1234) ? 
(int16 * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int16-7.ispc b/tests/gather-int16-7.ispc new file mode 100644 index 00000000..a6236af5 --- /dev/null +++ b/tests/gather-int16-7.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int16-8.ispc b/tests/gather-int16-8.ispc new file mode 100644 index 00000000..66bc8e89 --- /dev/null +++ b/tests/gather-int16-8.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int16 *ptr = (aFOO[0] == 1234) ? 
(int16 * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int32-1.ispc b/tests/gather-int32-1.ispc new file mode 100644 index 00000000..2df1dd7e --- /dev/null +++ b/tests/gather-int32-1.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int32-2.ispc b/tests/gather-int32-2.ispc new file mode 100644 index 00000000..61f5a024 --- /dev/null +++ b/tests/gather-int32-2.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int32-3.ispc b/tests/gather-int32-3.ispc new file mode 100644 index 00000000..e87eab33 --- /dev/null +++ b/tests/gather-int32-3.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int32-4.ispc 
b/tests/gather-int32-4.ispc new file mode 100644 index 00000000..8a6d7bb6 --- /dev/null +++ b/tests/gather-int32-4.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int32-5.ispc b/tests/gather-int32-5.ispc new file mode 100644 index 00000000..573666c7 --- /dev/null +++ b/tests/gather-int32-5.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int32-6.ispc b/tests/gather-int32-6.ispc new file mode 100644 index 00000000..0d59a8fc --- /dev/null +++ b/tests/gather-int32-6.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int *ptr = (aFOO[0] == 1234) ? 
(int * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int32-7.ispc b/tests/gather-int32-7.ispc new file mode 100644 index 00000000..ebc724e5 --- /dev/null +++ b/tests/gather-int32-7.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int32-8.ispc b/tests/gather-int32-8.ispc new file mode 100644 index 00000000..03cd7c8b --- /dev/null +++ b/tests/gather-int32-8.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int *ptr = (aFOO[0] == 1234) ? 
(int * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int64-1.ispc b/tests/gather-int64-1.ispc new file mode 100644 index 00000000..fe3d171b --- /dev/null +++ b/tests/gather-int64-1.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int64-2.ispc b/tests/gather-int64-2.ispc new file mode 100644 index 00000000..7a00439b --- /dev/null +++ b/tests/gather-int64-2.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int64-3.ispc b/tests/gather-int64-3.ispc new file mode 100644 index 00000000..7ddd559c --- /dev/null +++ b/tests/gather-int64-3.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int64-4.ispc 
b/tests/gather-int64-4.ispc new file mode 100644 index 00000000..92e004e3 --- /dev/null +++ b/tests/gather-int64-4.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int64-5.ispc b/tests/gather-int64-5.ispc new file mode 100644 index 00000000..76d95f2d --- /dev/null +++ b/tests/gather-int64-5.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int64-6.ispc b/tests/gather-int64-6.ispc new file mode 100644 index 00000000..9deaaa80 --- /dev/null +++ b/tests/gather-int64-6.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int64 *ptr = (aFOO[0] == 1234) ? 
(int64 * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int64-7.ispc b/tests/gather-int64-7.ispc new file mode 100644 index 00000000..52df9d19 --- /dev/null +++ b/tests/gather-int64-7.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int64-8.ispc b/tests/gather-int64-8.ispc new file mode 100644 index 00000000..5cfa621b --- /dev/null +++ b/tests/gather-int64-8.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int64 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int64 *ptr = (aFOO[0] == 1234) ? 
(int64 * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int8-1.ispc b/tests/gather-int8-1.ispc index 305b12ca..43961ff2 100644 --- a/tests/gather-int8-1.ispc +++ b/tests/gather-int8-1.ispc @@ -1,19 +1,17 @@ + export uniform int width() { return programCount; } -export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int8 x[programCount]; - x[programIndex] = programIndex; - int a = aFOO[programIndex]-1; - unsigned int8 v; - if (programIndex < 2) - v = x[a]; - else - v = 2; - RET[programIndex] = v; +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; } export void result(uniform float RET[]) { - RET[programIndex] = 2; - RET[0] = 0; - RET[1] = 1; + RET[programIndex] = 1 + programIndex; } diff --git a/tests/gather-int8-2.ispc b/tests/gather-int8-2.ispc new file mode 100644 index 00000000..8e853d0e --- /dev/null +++ b/tests/gather-int8-2.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int8-3.ispc b/tests/gather-int8-3.ispc new file mode 100644 index 00000000..5650ab7f --- /dev/null +++ b/tests/gather-int8-3.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for 
(uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int8-4.ispc b/tests/gather-int8-4.ispc new file mode 100644 index 00000000..92386d5a --- /dev/null +++ b/tests/gather-int8-4.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + if (programIndex < 2) + g = a[programIndex+zero]; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int8-5.ispc b/tests/gather-int8-5.ispc new file mode 100644 index 00000000..d0440d77 --- /dev/null +++ b/tests/gather-int8-5.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int8-6.ispc b/tests/gather-int8-6.ispc new file mode 100644 index 00000000..840b309c --- /dev/null +++ b/tests/gather-int8-6.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int8 *ptr = (aFOO[0] == 1234) ? 
(int8 * varying)gptr : (a + programIndex); + int g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} diff --git a/tests/gather-int8-7.ispc b/tests/gather-int8-7.ispc new file mode 100644 index 00000000..c0190db0 --- /dev/null +++ b/tests/gather-int8-7.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} diff --git a/tests/gather-int8-8.ispc b/tests/gather-int8-8.ispc new file mode 100644 index 00000000..3c5cd41e --- /dev/null +++ b/tests/gather-int8-8.ispc @@ -0,0 +1,23 @@ + +export uniform int width() { return programCount; } + +int64 zero = 0; +void *gptr; + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a[programCount]; + for (uniform int i = 0; i < programCount; ++i) + a[i] = aFOO[i]; + + int g = 0; + int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex); + if (programIndex < 2) + g = *ptr; + RET[programIndex] = g; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + RET[0] = 1; + RET[1] = 2; +} From 371d4be8efb1b7801e7081ad1a2a995a6930ad6d Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 12 Jul 2012 14:10:59 -0700 Subject: [PATCH 10/15] Fix bugs in detection of Ivy Bridge systems. We were incorrectly characterizing them as basic AVX1 without further extensions, due to a bug in the logic to check CPU features. 
--- builtins/dispatch.ll | 70 +++++++++++++++++++++----------------------- ispc.cpp | 26 ++++++++-------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/builtins/dispatch.ll b/builtins/dispatch.ll index b9db3543..f1d5a969 100644 --- a/builtins/dispatch.ll +++ b/builtins/dispatch.ll @@ -76,18 +76,19 @@ declare void @abort() noreturn ;; /* NOTE: the values returned below must be the same as the ;; corresponding enumerant values in Target::ISA. */ ;; if ((info[2] & (1 << 28)) != 0) { -;; // AVX1 for sure. Do we have AVX2? -;; // Call cpuid with eax=7, ecx=0 -;; __cpuid_count(info, 7, 0); -;; if ((info[1] & (1 << 5)) != 0) -;; return 4; // AVX2 -;; else { -;; if ((info[2] & (1 << 29)) != 0 && // F16C -;; (info[2] & (1 << 30)) != 0) // RDRAND -;; return 3; // AVX1 on IVB -;; else -;; return 2; // AVX1 -;; } +;; if ((info[2] & (1 << 29)) != 0 && // F16C +;; (info[2] & (1 << 30)) != 0) { // RDRAND +;; // So far, so good. AVX2? +;; // Call cpuid with eax=7, ecx=0 +;; int info2[4]; +;; __cpuid_count(info2, 7, 0); +;; if ((info2[1] & (1 << 5)) != 0) +;; return 4; +;; else +;; return 3; +;; } +;; // Regular AVX +;; return 2; ;; } ;; else if ((info[2] & (1 << 19)) != 0) ;; return 1; // SSE4 @@ -104,40 +105,37 @@ entry: %asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3 %and = and i32 %asmresult5.i, 268435456 %cmp = icmp eq i32 %and, 0 - br i1 %cmp, label %if.else14, label %if.then + br i1 %cmp, label %if.else13, label %if.then if.then: ; preds = %entry - %1 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind - %asmresult4.i29 = extractvalue { i32, i32, i32, i32 } %1, 1 - %and3 = and i32 %asmresult4.i29, 32 - %cmp4 = icmp eq i32 %and3, 0 - br i1 %cmp4, label %if.else, label %return + %1 = and i32 %asmresult5.i, 1610612736 + %2 = icmp eq i32 %1, 1610612736 + br i1 %2, label %if.then7, label %return 
-if.else: ; preds = %if.then - %asmresult5.i30 = extractvalue { i32, i32, i32, i32 } %1, 2 - %2 = and i32 %asmresult5.i30, 1610612736 - %3 = icmp eq i32 %2, 1610612736 - br i1 %3, label %return, label %if.else13 - -if.else13: ; preds = %if.else +if.then7: ; preds = %if.then + %3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind + %asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1 + %and10 = lshr i32 %asmresult4.i28, 5 + %4 = and i32 %and10, 1 + %5 = add i32 %4, 3 br label %return -if.else14: ; preds = %entry - %and16 = and i32 %asmresult5.i, 524288 - %cmp17 = icmp eq i32 %and16, 0 - br i1 %cmp17, label %if.else19, label %return +if.else13: ; preds = %entry + %and15 = and i32 %asmresult5.i, 524288 + %cmp16 = icmp eq i32 %and15, 0 + br i1 %cmp16, label %if.else18, label %return -if.else19: ; preds = %if.else14 - %and21 = and i32 %asmresult6.i, 67108864 - %cmp22 = icmp eq i32 %and21, 0 - br i1 %cmp22, label %if.else24, label %return +if.else18: ; preds = %if.else13 + %and20 = and i32 %asmresult6.i, 67108864 + %cmp21 = icmp eq i32 %and20, 0 + br i1 %cmp21, label %if.else23, label %return -if.else24: ; preds = %if.else19 +if.else23: ; preds = %if.else18 tail call void @abort() noreturn nounwind unreachable -return: ; preds = %if.else19, %if.else14, %if.else13, %if.else, %if.then - %retval.0 = phi i32 [ 2, %if.else13 ], [ 4, %if.then ], [ 3, %if.else ], [ 1, %if.else14 ], [ 0, %if.else19 ] +return: ; preds = %if.else18, %if.else13, %if.then7, %if.then + %retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ] ret i32 %retval.0 } diff --git a/ispc.cpp b/ispc.cpp index 8fb8f0f5..15c8d4ae 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -94,20 +94,22 @@ lGetSystemISA() { int info[4]; __cpuid(info, 1); - if ((info[2] & (1 << 28)) != 0) { - // AVX1 for sure. Do we have AVX2? 
- // Call cpuid with eax=7, ecx=0 - __cpuidex(info, 7, 0); - if ((info[1] & (1 << 5)) != 0) - return "avx2"; - else { - // ivybridge? - if ((info[2] & (1 << 29)) != 0 && // F16C - (info[2] & (1 << 30)) != 0) // RDRAND - return "avx1.1"; + if ((info[2] & (1 << 28)) != 0) { // AVX + // AVX1 for sure.... + // Ivy Bridge? + if ((info[2] & (1 << 29)) != 0 && // F16C + (info[2] & (1 << 30)) != 0) { // RDRAND + // So far, so good. AVX2? + // Call cpuid with eax=7, ecx=0 + int info2[4]; + __cpuidex(info2, 7, 0); + if ((info2[1] & (1 << 5)) != 0) + return "avx2"; else - return "avx"; + return "avx1.1"; } + // Regular AVX + return "avx"; } else if ((info[2] & (1 << 19)) != 0) return "sse4"; From 9a1932eaf765d5f279d605a14138ce923f730171 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 13 Jul 2012 12:01:04 -0700 Subject: [PATCH 11/15] Only set gcc's "-msse4.2", etc, option when compiling for generic targets. We don't need it when ispc is just generating an object file directly, and gcc on OS X doesn't recognize -mavx. 
--- run_tests.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/run_tests.py b/run_tests.py index 0cb78ed6..ea53b432 100755 --- a/run_tests.py +++ b/run_tests.py @@ -265,11 +265,9 @@ def run_test(filename): gcc_arch = '-m64' gcc_isa="" - if options.target == 'sse2' or options.target == 'sse2-x2': - gcc_isa = '-msse3' - if options.target == 'sse4' or options.target == 'sse4-x2' or options.target == 'generic-4': + if options.target == 'generic-4': gcc_isa = '-msse4.2' - if options.target == 'avx' or options.target == 'avx-x2' or options.target == 'generic-8': + if options.target == 'generic-8': gcc_isa = '-mavx' if (options.target == 'generic-16' or options.target == 'generic-32' or options.target == 'generic-64') \ and (options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1): From 98b2e0e426ac008151dd59cf4bf1650683015073 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 13 Jul 2012 12:14:10 -0700 Subject: [PATCH 12/15] Fixes for intrinsics unsupported in earlier LLVM versions. Specifically, don't use the half/float conversion routines with LLVM 3.0, and don't try to use RDRAND with anything before LLVM 3.2. 
--- builtins/target-avx11-x2.ll | 10 ++++++++-- builtins/target-avx11.ll | 8 +++++++- builtins/target-avx2-x2.ll | 9 +++++++-- builtins/target-avx2.ll | 8 +++++++- ispc.cpp | 18 ++++++++++++++++++ 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/builtins/target-avx11-x2.ll b/builtins/target-avx11-x2.ll index cdb83726..1aa6345c 100644 --- a/builtins/target-avx11-x2.ll +++ b/builtins/target-avx11-x2.ll @@ -31,7 +31,9 @@ include(`target-avx-x2.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -73,6 +75,9 @@ gen_gather(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... +', ` declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone ; 0 is round nearest even declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone @@ -123,4 +128,5 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { %r = extractelement <8 x i16> %rv, i32 0 ret i16 %r } - +' +) diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index d3ab9f13..fea0a7c2 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -31,7 +31,9 @@ include(`target-avx.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -73,6 +75,9 @@ gen_gather(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... 
+', ` declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone ; 0 is round nearest even declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone @@ -107,3 +112,4 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { %r = extractelement <8 x i16> %rv, i32 0 ret i16 %r } +') diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index 1d2a2093..6572783f 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -31,7 +31,9 @@ include(`target-avx-x2.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -68,6 +70,9 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... 
+', ` declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone ; 0 is round nearest even declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone @@ -118,7 +123,7 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { %r = extractelement <8 x i16> %rv, i32 0 ret i16 %r } - +') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index 45496779..e36a74de 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -31,7 +31,9 @@ include(`target-avx.ll') -rdrand_definition() +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -68,6 +70,9 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... 
+', ` declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone ; 0 is round nearest even declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone @@ -102,6 +107,7 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { %r = extractelement <8 x i16> %rv, i32 0 ret i16 %r } +') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather diff --git a/ispc.cpp b/ispc.cpp index 15c8d4ae..636dfdd4 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -328,8 +328,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; t->maskingIsFree = false; t->maskBitCount = 32; +#if !defined(LLVM_3_0) + // LLVM 3.1+ only t->hasHalf = true; + #if !defined(LLVM_3_1) + // LLVM 3.2+ only t->hasRand = true; + #endif +#endif } else if (!strcasecmp(isa, "avx1.1-x2")) { t->isa = Target::AVX11; @@ -338,8 +344,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; t->maskingIsFree = false; t->maskBitCount = 32; +#if !defined(LLVM_3_0) + // LLVM 3.1+ only t->hasHalf = true; + #if !defined(LLVM_3_1) + // LLVM 3.2+ only t->hasRand = true; + #endif +#endif } #ifndef LLVM_3_0 else if (!strcasecmp(isa, "avx2")) { @@ -350,7 +362,10 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskingIsFree = false; t->maskBitCount = 32; t->hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only t->hasRand = true; +#endif } else if (!strcasecmp(isa, "avx2-x2")) { t->isa = Target::AVX2; @@ -360,7 +375,10 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskingIsFree = false; t->maskBitCount = 32; t->hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only t->hasRand = true; +#endif } #endif // !LLVM_3_0 else { From daf5aa8e8b0a0dfcaa75753ad9be4a5823d0200a Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 13 Jul 2012 12:14:53 -0700 Subject: [PATCH 
13/15] Run inst combine before memory optimizations. We were previously emitting 64-bit indexing for some gathers where 32-bit was actually fine, due to some adds of constant vectors that hadn't been simplified to the result. --- opt.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/opt.cpp b/opt.cpp index 824c5bc7..a623466b 100644 --- a/opt.cpp +++ b/opt.cpp @@ -447,6 +447,7 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableGatherScatterOptimizations == false && g->target.vectorWidth > 1) { + optPM.add(llvm::createInstructionCombiningPass()); optPM.add(CreateImproveMemoryOpsPass()); } if (!g->opt.disableMaskAllOnOptimizations) { @@ -489,6 +490,7 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableGatherScatterOptimizations == false && g->target.vectorWidth > 1) { + optPM.add(llvm::createInstructionCombiningPass()); optPM.add(CreateImproveMemoryOpsPass()); if (g->opt.disableCoalescing == false && @@ -507,6 +509,7 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableGatherScatterOptimizations == false && g->target.vectorWidth > 1) { + optPM.add(llvm::createInstructionCombiningPass()); optPM.add(CreateImproveMemoryOpsPass()); } From 984a68c3a9657737846e8fe86c117a3294961824 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 13 Jul 2012 12:20:42 -0700 Subject: [PATCH 14/15] Rename gen_gather() macro to gen_gather_factored() --- builtins/target-avx1-x2.ll | 12 ++++++------ builtins/target-avx1.ll | 12 ++++++------ builtins/target-avx2-x2.ll | 12 ++++++------ builtins/target-avx2.ll | 12 ++++++------ builtins/target-generic-1.ll | 12 ++++++------ builtins/target-sse2-x2.ll | 12 ++++++------ builtins/target-sse2.ll | 12 ++++++------ builtins/target-sse4-x2.ll | 12 ++++++------ builtins/target-sse4.ll | 12 ++++++------ builtins/util.m4 | 2 +- 10 files changed, 55 insertions(+), 55 deletions(-) mode change 100755 => 100644 builtins/target-generic-1.ll diff --git a/builtins/target-avx1-x2.ll 
b/builtins/target-avx1-x2.ll index e06134d9..562d7ff0 100644 --- a/builtins/target-avx1-x2.ll +++ b/builtins/target-avx1-x2.ll @@ -73,9 +73,9 @@ declare @__float_to_half_varying( %v) nounwind read ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) diff --git a/builtins/target-avx1.ll b/builtins/target-avx1.ll index 1b47955a..9c86cab8 100644 --- a/builtins/target-avx1.ll +++ b/builtins/target-avx1.ll @@ -73,9 +73,9 @@ declare @__float_to_half_varying( %v) nounwind read ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index 6572783f..a2a4fd34 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -128,9 +128,9 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index e36a74de..4b4b38c5 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -112,9 +112,9 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll old mode 100755 new mode 100644 index 5e82b4f1..c5937c8e --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -34,12 +34,12 @@ masked_load(double, 8) ; define these with the macros from stdlib.m4 -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 0260971a..ad19f899 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -444,12 +444,12 @@ masked_load(double, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 5f40d1eb..6558adc8 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -575,12 +575,12 @@ masked_load(double, 8) ; define these with the macros from stdlib.m4 -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) 
+gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index ef3a7746..0f7cb355 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -371,12 +371,12 @@ masked_load(double, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index ee57f6bd..b00bcfd6 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -474,12 +474,12 @@ masked_load(double, 8) ; define these with the macros from stdlib.m4 -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/util.m4 b/builtins/util.m4 index 614ac998..d29bcbca 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -3472,7 +3472,7 @@ pl_done: ;; $1: scalar type for which to generate functions to do gathers ; vec width, type -define(`gen_gather', ` +define(`gen_gather_factored', ` ;; Define the utility function to do the gather operation for a single element ;; of the type define @__gather_elt32_$1(i8 * %ptr, %offsets, i32 %offset_scale, From 6a410fc30e418d2f166558d95af5981c6a64abdc Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Fri, 13 Jul 2012 12:29:05 -0700 Subject: [PATCH 15/15] Emit gather instructions for the AVX2 targets. Issue #308. 
--- builtins/target-avx2-x2.ll | 427 ++++++++++++++++++++++++++++++++++++- builtins/target-avx2.ll | 315 ++++++++++++++++++++++++++- builtins/util.m4 | 93 +++++--- ispc.cpp | 2 + 4 files changed, 808 insertions(+), 29 deletions(-) diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index a2a4fd34..053fd078 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -29,6 +29,10 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +ifelse(LLVM_VERSION, `LLVM_3_0', `', + LLVM_VERSION, `LLVM_3_1', `', + `define(`HAVE_GATHER', `1')') + include(`target-avx-x2.ll') ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', @@ -128,9 +132,430 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather +declare void @llvm.trap() noreturn nounwind + +; $1: type +; $2: var base name +define(`extract_4s', ` + %$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> + %$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> + %$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> + %$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> +') + +; $1: type +; $2: var base name +define(`extract_8s', ` + %$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, + <8 x i32> + %$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, + <8 x i32> +') + +; $1: element type +; $2: ret name +; $3: v1 +; $4: v2 +define(`assemble_8s', ` + %$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4, + <16 x i32> +') + +; $1: element type +; $2: ret name +; $3: v1 +; $4: v2 +; $5: v3 +; $6: v4 +define(`assemble_4s', ` + %$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4, + <8 x i32> + %$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6, + <8 x i32> + assemble_8s($1, $2, $2_1, $2_2) +') + +ifelse(LLVM_VERSION, `LLVM_3_0', ` gen_gather_factored(i8) gen_gather_factored(i16) 
gen_gather_factored(i32) gen_gather_factored(float) gen_gather_factored(i64) -gen_gather_factored(double) +gen_gather_factored(double)', +LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + +gen_gather(i8) +gen_gather(i16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 gathers + +declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr, + <8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind +declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind + +define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + extract_8s(i32, offsets) + extract_8s(i32, vecmask) + + %v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr, + <8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale8) + %v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr, + <8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale8) + + assemble_8s(i32, v, v1, v2) + + ret <16 x i32> %v +} + + +define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr, + i32 %scale, <16 x i64> %offsets, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + extract_4s(i32, vecmask) + extract_4s(i64, offsets) + + %v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8) + %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8) + %v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale8) + 
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale8) + + assemble_4s(i32, v, v1, v2, v3, v4) + + ret <16 x i32> %v +} + + +define <16 x i32> @__gather32_i32(<16 x i32> %ptrs, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + extract_8s(i32, ptrs) + extract_8s(i32, vecmask) + + %v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null, + <8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1) + %v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null, + <8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1) + + assemble_8s(i32, v, v1, v2) + + ret <16 x i32> %v +} + + +define <16 x i32> @__gather64_i32(<16 x i64> %ptrs, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + extract_4s(i64, ptrs) + extract_4s(i32, vecmask) + + %v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1) + %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1) + %v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1) + %v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1) + + assemble_4s(i32, v, v1, v2, v3, v4) + + ret <16 x i32> %v +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float gathers + +declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr, + <8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind +declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr, + <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind + +define <16 x float> @__gather_base_offsets32_float(i8 * %ptr, + i32 %scale, <16 x i32> %offsets, + <16 x i32> %vecmask) nounwind readonly 
alwaysinline { + %scale8 = trunc i32 %scale to i8 + %mask = bitcast <16 x i32> %vecmask to <16 x float> + extract_8s(i32, offsets) + extract_8s(float, mask) + + %v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr, + <8 x i32> %offsets_1, <8 x float> %mask_1, i8 %scale8) + %v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr, + <8 x i32> %offsets_2, <8 x float> %mask_2, i8 %scale8) + + assemble_8s(float, v, v1, v2) + + ret <16 x float> %v +} + + +define <16 x float> @__gather_base_offsets64_float(i8 * %ptr, + i32 %scale, <16 x i64> %offsets, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %mask = bitcast <16 x i32> %vecmask to <16 x float> + extract_4s(i64, offsets) + extract_4s(float, mask) + + %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8) + %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8) + %v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_3, <4 x float> %mask_3, i8 %scale8) + %v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_4, <4 x float> %mask_4, i8 %scale8) + + assemble_4s(float, v, v1, v2, v3, v4) + + ret <16 x float> %v +} + + +define <16 x float> @__gather32_float(<16 x i32> %ptrs, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %mask = bitcast <16 x i32> %vecmask to <16 x float> + extract_8s(float, mask) + extract_8s(i32, ptrs) + + %v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null, + <8 x i32> %ptrs_1, <8 x float> %mask_1, i8 1) + %v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null, + <8 x i32> %ptrs_2, <8 x float> %mask_2, i8 1) + + assemble_8s(float, v, v1, v2) + + ret <16 x float> %v 
+} + + +define <16 x float> @__gather64_float(<16 x i64> %ptrs, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %mask = bitcast <16 x i32> %vecmask to <16 x float> + extract_4s(i64, ptrs) + extract_4s(float, mask) + + %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1) + %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1) + %v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_3, <4 x float> %mask_3, i8 1) + %v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_4, <4 x float> %mask_4, i8 1) + + assemble_4s(float, v, v1, v2, v3, v4) + + ret <16 x float> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int64 gathers + +declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind +declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind + +define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr, + i32 %scale, <16 x i32> %offsets, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = sext <16 x i32> %mask32 to <16 x i64> + extract_4s(i32, offsets) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8) + %v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8) + %v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> 
undef, i8 * %ptr, + <4 x i32> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8) + + assemble_4s(i64, v, v1, v2, v3, v4) + + ret <16 x i64> %v +} + + +define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr, + i32 %scale, <16 x i64> %offsets, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = sext <16 x i32> %mask32 to <16 x i64> + extract_4s(i64, offsets) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8) + %v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8) + %v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8) + + assemble_4s(i64, v, v1, v2, v3, v4) + + ret <16 x i64> %v +} + + +define <16 x i64> @__gather32_i64(<16 x i32> %ptrs, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask = sext <16 x i32> %mask32 to <16 x i64> + extract_4s(i32, ptrs) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1) + %v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_3, <4 x i64> %vecmask_3, i8 1) + %v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_4, <4 x i64> %vecmask_4, i8 1) + + assemble_4s(i64, v, v1, v2, v3, v4) + + ret <16 x i64> %v +} + +define <16 x i64> @__gather64_i64(<16 x i64> %ptrs, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask = sext <16 x i32> 
%mask32 to <16 x i64> + extract_4s(i64, ptrs) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1) + %v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_3, <4 x i64> %vecmask_3, i8 1) + %v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_4, <4 x i64> %vecmask_4, i8 1) + + assemble_4s(i64, v, v1, v2, v3, v4) + + ret <16 x i64> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double gathers + +declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind +declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind + +define <16 x double> @__gather_base_offsets32_double(i8 * %ptr, + i32 %scale, <16 x i32> %offsets, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask64 = sext <16 x i32> %mask32 to <16 x i64> + %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double> + extract_4s(i32, offsets) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8) + %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8) + %v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_3, <4 x double> %vecmask_3, i8 %scale8) + %v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_4, <4 
x double> %vecmask_4, i8 %scale8) + + assemble_4s(double, v, v1, v2, v3, v4) + + ret <16 x double> %v +} + + +define <16 x double> @__gather_base_offsets64_double(i8 * %ptr, + i32 %scale, <16 x i64> %offsets, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask64 = sext <16 x i32> %mask32 to <16 x i64> + %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double> + extract_4s(i64, offsets) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8) + %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8) + %v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_3, <4 x double> %vecmask_3, i8 %scale8) + %v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_4, <4 x double> %vecmask_4, i8 %scale8) + + assemble_4s(double, v, v1, v2, v3, v4) + + ret <16 x double> %v +} + + +define <16 x double> @__gather32_double(<16 x i32> %ptrs, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask64 = sext <16 x i32> %mask32 to <16 x i64> + %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double> + extract_4s(i32, ptrs) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1) + %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1) + %v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_3, <4 x double> %vecmask_3, i8 1) + %v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_4, <4 x double> %vecmask_4, i8 1) + + 
assemble_4s(double, v, v1, v2, v3, v4) + + ret <16 x double> %v +} + + +define <16 x double> @__gather64_double(<16 x i64> %ptrs, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask64 = sext <16 x i32> %mask32 to <16 x i64> + %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double> + extract_4s(i64, ptrs) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1) + %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1) + %v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_3, <4 x double> %vecmask_3, i8 1) + %v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_4, <4 x double> %vecmask_4, i8 1) + + assemble_4s(double, v, v1, v2, v3, v4) + + ret <16 x double> %v +} + +') diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index 4b4b38c5..f4a0ee07 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -29,6 +29,10 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ifelse(LLVM_VERSION, `LLVM_3_0', `', + LLVM_VERSION, `LLVM_3_1', `', + `define(`HAVE_GATHER', `1')') + include(`target-avx.ll') ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', @@ -112,9 +116,318 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather +declare void @llvm.trap() noreturn nounwind + +define(`extract_4s', ` + %$2_1 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> + %$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> +') + +ifelse(LLVM_VERSION, `LLVM_3_0', ` gen_gather_factored(i8) gen_gather_factored(i16) gen_gather_factored(i32) gen_gather_factored(float) gen_gather_factored(i64) -gen_gather_factored(double) +gen_gather_factored(double)', +LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + +gen_gather(i8) +gen_gather(i16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 gathers + +declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr, + <8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind +declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind + +define <8 x i32> @__gather_base_offsets32_i32(i8 * %ptr, + i32 %scale, <8 x i32> %offsets, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + %v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr, + <8 x i32> %offsets, <8 x i32> %vecmask, i8 %scale8) + + ret <8 x i32> %v +} + + +define <8 x i32> @__gather_base_offsets64_i32(i8 * %ptr, + i32 %scale, <8 x i64> %offsets, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + extract_4s(i32, vecmask) + extract_4s(i64, offsets) + + %v1 = 
call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8) + %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x i32> %v1, <4 x i32> %v2, + <8 x i32> + ret <8 x i32> %v +} + + +define <8 x i32> @__gather32_i32(<8 x i32> %ptrs, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null, + <8 x i32> %ptrs, <8 x i32> %vecmask, i8 1) + ret <8 x i32> %v +} + + +define <8 x i32> @__gather64_i32(<8 x i64> %ptrs, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + extract_4s(i64, ptrs) + extract_4s(i32, vecmask) + + %v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1) + %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1) + + %v = shufflevector <4 x i32> %v1, <4 x i32> %v2, + <8 x i32> + ret <8 x i32> %v +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float gathers + +declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr, + <8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind +declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr, + <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind + +define <8 x float> @__gather_base_offsets32_float(i8 * %ptr, + i32 %scale, <8 x i32> %offsets, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %mask = bitcast <8 x i32> %vecmask to <8 x float> + + %v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr, + <8 x i32> %offsets, <8 x float> %mask, i8 %scale8) + + ret <8 x float> %v +} + + +define <8 x float> @__gather_base_offsets64_float(i8 
* %ptr, + i32 %scale, <8 x i64> %offsets, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %mask = bitcast <8 x i32> %vecmask to <8 x float> + extract_4s(i64, offsets) + extract_4s(float, mask) + + %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8) + %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8) + + %v = shufflevector <4 x float> %v1, <4 x float> %v2, + <8 x i32> + ret <8 x float> %v +} + + +define <8 x float> @__gather32_float(<8 x i32> %ptrs, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %mask = bitcast <8 x i32> %vecmask to <8 x float> + + %v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null, + <8 x i32> %ptrs, <8 x float> %mask, i8 1) + + ret <8 x float> %v +} + + +define <8 x float> @__gather64_float(<8 x i64> %ptrs, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %mask = bitcast <8 x i32> %vecmask to <8 x float> + extract_4s(i64, ptrs) + extract_4s(float, mask) + + %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1) + %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1) + + %v = shufflevector <4 x float> %v1, <4 x float> %v2, + <8 x i32> + ret <8 x float> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int64 gathers + +declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind +declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind + +define <8 x i64> @__gather_base_offsets32_i64(i8 * %ptr, + i32 %scale, <8 x i32> 
%offsets, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = sext <8 x i32> %mask32 to <8 x i64> + extract_4s(i32, offsets) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x i64> %v1, <4 x i64> %v2, + <8 x i32> + ret <8 x i64> %v +} + + +define <8 x i64> @__gather_base_offsets64_i64(i8 * %ptr, + i32 %scale, <8 x i64> %offsets, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = sext <8 x i32> %mask32 to <8 x i64> + extract_4s(i64, offsets) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x i64> %v1, <4 x i64> %v2, + <8 x i32> + ret <8 x i64> %v +} + + +define <8 x i64> @__gather32_i64(<8 x i32> %ptrs, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask = sext <8 x i32> %mask32 to <8 x i64> + + extract_4s(i32, ptrs) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1) + %v = shufflevector <4 x i64> %v1, <4 x i64> %v2, + <8 x i32> + ret <8 x i64> %v +} + + +define <8 x i64> @__gather64_i64(<8 x i64> %ptrs, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask = sext <8 x i32> %mask32 to <8 x i64> + extract_4s(i64, ptrs) + extract_4s(i64, vecmask) + + %v1 
= call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1) + + %v = shufflevector <4 x i64> %v1, <4 x i64> %v2, + <8 x i32> + ret <8 x i64> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double gathers + +declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind +declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind + +define <8 x double> @__gather_base_offsets32_double(i8 * %ptr, + i32 %scale, <8 x i32> %offsets, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask64 = sext <8 x i32> %mask32 to <8 x i64> + %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double> + extract_4s(i32, offsets) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8) + %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x double> %v1, <4 x double> %v2, + <8 x i32> + ret <8 x double> %v +} + +define <8 x double> @__gather_base_offsets64_double(i8 * %ptr, + i32 %scale, <8 x i64> %offsets, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask64 = sext <8 x i32> %mask32 to <8 x i64> + %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double> + extract_4s(i64, offsets) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x double> %vecmask_1, 
i8 %scale8) + %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x double> %v1, <4 x double> %v2, + <8 x i32> + ret <8 x double> %v +} + +define <8 x double> @__gather32_double(<8 x i32> %ptrs, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask64 = sext <8 x i32> %mask32 to <8 x i64> + %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double> + extract_4s(i32, ptrs) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1) + %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1) + + %v = shufflevector <4 x double> %v1, <4 x double> %v2, + <8 x i32> + ret <8 x double> %v +} + +define <8 x double> @__gather64_double(<8 x i64> %ptrs, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask64 = sext <8 x i32> %mask32 to <8 x i64> + %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double> + extract_4s(i64, ptrs) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1) + %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1) + + %v = shufflevector <4 x double> %v1, <4 x double> %v2, + <8 x i32> + + ret <8 x double> %v +} + +') diff --git a/builtins/util.m4 b/builtins/util.m4 index d29bcbca..a97336a7 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -3471,6 +3471,40 @@ pl_done: ;; ;; $1: scalar type for which to generate functions to do gathers +define(`gen_gather_general', ` +; fully general 32-bit gather, takes array of pointers encoded as vector of i32s +define @__gather32_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + 
%ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID + ') + + %ret = load * %ret_ptr + ret %ret +} + +; fully general 64-bit gather, takes array of pointers encoded as vector of i64s +define @__gather64_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + %ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID + ') + + %ret = load * %ret_ptr + ret %ret +} +') + ; vec width, type define(`gen_gather_factored', ` ;; Define the utility function to do the gather operation for a single element @@ -3582,37 +3616,42 @@ define @__gather_factored_base_offsets64_$1(i8 * %ptr, %ret`'eval(WIDTH-1) } -; fully general 32-bit gather, takes array of pointers encoded as vector of i32s -define @__gather32_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { - %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` - %iptr_LANE_ID = extractelement %ptrs, i32 LANE - %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * - %val_LANE_ID = load $1 * %ptr_LANE_ID - %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE - store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID - ') +gen_gather_general($1) +' +) - %ret = load * %ret_ptr - ret %ret +; vec width, type +define(`gen_gather', ` + +gen_gather_factored($1) + +define +@__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, + %offsets, + %vecmask) nounwind readonly alwaysinline { + %scale_vec = bitcast i32 %offset_scale to <1 x i32> + %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, + < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 
> + %scaled_offsets = mul %smear_scale, %offsets + %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, + zeroinitializer, %vecmask) + ret %v } -; fully general 64-bit gather, takes array of pointers encoded as vector of i32s -define @__gather64_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { - %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` - %iptr_LANE_ID = extractelement %ptrs, i32 LANE - %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * - %val_LANE_ID = load $1 * %ptr_LANE_ID - %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE - store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID - ') - - %ret = load * %ret_ptr - ret %ret +define +@__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, + %offsets, + %vecmask) nounwind readonly alwaysinline { + %scale64 = zext i32 %offset_scale to i64 + %scale_vec = bitcast i64 %scale64 to <1 x i64> + %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, + < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > + %scaled_offsets = mul %smear_scale, %offsets + %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, + i32 1, zeroinitializer, %vecmask) + ret %v } + ' ) diff --git a/ispc.cpp b/ispc.cpp index 636dfdd4..fac83bbe 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -365,6 +365,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, #if !defined(LLVM_3_1) // LLVM 3.2+ only t->hasRand = true; + t->hasGather = true; #endif } else if (!strcasecmp(isa, "avx2-x2")) { @@ -378,6 +379,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, #if !defined(LLVM_3_1) // LLVM 3.2+ only t->hasRand = true; + t->hasGather = true; #endif } #endif // !LLVM_3_0