From d37455925fe8dac7b06d5cedfdb2402d263d2661 Mon Sep 17 00:00:00 2001
From: Anton Mitrokhin
Date: Sun, 16 Aug 2015 21:41:11 +0300
Subject: [PATCH] [AVX-512]: Scatters for i32/float

---
 builtins/target-avx512-common.ll | 136 ++++++++++++++++++++++++-------
 1 file changed, 107 insertions(+), 29 deletions(-)

diff --git a/builtins/target-avx512-common.ll b/builtins/target-avx512-common.ll
index 38256794..317b3ed8 100644
--- a/builtins/target-avx512-common.ll
+++ b/builtins/target-avx512-common.ll
@@ -976,24 +976,10 @@ define void @__masked_store_blend_double(<16 x double>* nocapture,
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
 
-define(`scatterbo32_64', `
-define void @__scatter_base_offsets32_$1(i8* %ptr, i32 %scale, %offsets,
-                                         %vals, %mask) nounwind {
-  call void @__scatter_factored_base_offsets32_$1(i8* %ptr, <16 x i32> %offsets,
-      i32 %scale, <16 x i32> zeroinitializer, <16 x $1> %vals, %mask)
-  ret void
-}
-
-define void @__scatter_base_offsets64_$1(i8* %ptr, i32 %scale, %offsets,
-                                         %vals, %mask) nounwind {
-  call void @__scatter_factored_base_offsets64_$1(i8* %ptr, <16 x i64> %offsets,
-      i32 %scale, <16 x i64> zeroinitializer, <16 x $1> %vals, %mask)
-  ret void
-}
-')
-
-
+;; gather - i8
 gen_gather(i8)
+
+;; gather - i16
 gen_gather(i16)
 
 ;; gather - i32
@@ -1032,11 +1018,9 @@ define <16 x i32>
   ret <16 x i32> %res
 }
 
-
 ;; gather - i64
 gen_gather(i64)
 
-
 ;; gather - float
 declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
 define <16 x float>
@@ -1061,12 +1045,14 @@ define <16 x float>
   ret <16 x float> %res
 }
 
-define <16 x float> @__gather32_float(<16 x i32> %ptrs, <16 x i1> %vecmask) nounwind readonly alwaysinline {
+define <16 x float>
+@__gather32_float(<16 x i32> %ptrs, <16 x i1> %vecmask) nounwind readonly alwaysinline {
   %res = call <16 x float> @__gather_base_offsets32_float(i8 * zeroinitializer, i32 1, <16 x i32> %ptrs, <16 x i1> %vecmask)
   ret <16 x float> %res
 }
 
-define <16 x float> @__gather64_float(<16 x i64> %ptrs, <16 x i1> %vecmask) nounwind readonly alwaysinline {
+define <16 x float>
+@__gather64_float(<16 x i64> %ptrs, <16 x i1> %vecmask) nounwind readonly alwaysinline {
   %res = call <16 x float> @__gather_base_offsets64_float(i8 * zeroinitializer, i32 1, <16 x i64> %ptrs, <16 x i1> %vecmask)
   ret <16 x float> %res
 }
@@ -1075,18 +1061,110 @@ define <16 x float> @__gather64_float(<16 x i64> %ptrs, <16 x i1> %vecmask) nou
 gen_gather(double)
 
 
-scatterbo32_64(i8)
-scatterbo32_64(i16)
-scatterbo32_64(i32)
-scatterbo32_64(i64)
-scatterbo32_64(float)
-scatterbo32_64(double)
+define(`scatterbo32_64', `
+define void @__scatter_base_offsets32_$1(i8* %ptr, i32 %scale, %offsets,
+                                         %vals, %mask) nounwind {
+  call void @__scatter_factored_base_offsets32_$1(i8* %ptr, <16 x i32> %offsets,
+      i32 %scale, <16 x i32> zeroinitializer, <16 x $1> %vals, %mask)
+  ret void
+}
+define void @__scatter_base_offsets64_$1(i8* %ptr, i32 %scale, %offsets,
+                                         %vals, %mask) nounwind {
+  call void @__scatter_factored_base_offsets64_$1(i8* %ptr, <16 x i64> %offsets,
+      i32 %scale, <16 x i64> zeroinitializer, <16 x $1> %vals, %mask)
+  ret void
+}
+')
+
+;; scatter - i8
+scatterbo32_64(i8)
 gen_scatter(i8)
+
+;; scatter - i16
+scatterbo32_64(i16)
 gen_scatter(i16)
-gen_scatter(i32)
+
+;; scatter - i32
+declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
+define void
+@__scatter_base_offsets32_i32(i8* %ptr, i32 %offset_scale, <16 x i32> %offsets, <16 x i32> %vals, <16 x i1> %vecmask) nounwind {
+  %mask = bitcast <16 x i1> %vecmask to i16
+  call void @llvm.x86.avx512.scatter.dpi.512 (i8* %ptr, i16 %mask, <16 x i32> %offsets, <16 x i32> %vals, i32 %offset_scale)
+  ret void
+}
+
+declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
+define void
+@__scatter_base_offsets64_i32(i8* %ptr, i32 %offset_scale, <16 x i64> %offsets, <16 x i32> %vals, <16 x i1> %vecmask) nounwind {
+  %mask = bitcast <16 x i1> %vecmask to i16
+  %mask_shifted = lshr i16 %mask, 8
+  %mask_lo = trunc i16 %mask to i8
+  %mask_hi = trunc i16 %mask_shifted to i8
+  %offsets_lo = shufflevector <16 x i64> %offsets, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %offsets_hi = shufflevector <16 x i64> %offsets, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %res_lo = shufflevector <16 x i32> %vals, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %res_hi = shufflevector <16 x i32> %vals, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  call void @llvm.x86.avx512.scatter.qpi.512 (i8* %ptr, i8 %mask_lo, <8 x i64> %offsets_lo, <8 x i32> %res_lo, i32 %offset_scale)
+  call void @llvm.x86.avx512.scatter.qpi.512 (i8* %ptr, i8 %mask_hi, <8 x i64> %offsets_hi, <8 x i32> %res_hi, i32 %offset_scale)
+  ret void
+}
+
+define void
+@__scatter32_i32(<16 x i32> %ptrs, <16 x i32> %values, <16 x i1> %vecmask) nounwind alwaysinline {
+  call void @__scatter_base_offsets32_i32(i8 * zeroinitializer, i32 1, <16 x i32> %ptrs, <16 x i32> %values, <16 x i1> %vecmask)
+  ret void
+}
+
+define void
+@__scatter64_i32(<16 x i64> %ptrs, <16 x i32> %values, <16 x i1> %vecmask) nounwind alwaysinline {
+  call void @__scatter_base_offsets64_i32(i8 * zeroinitializer, i32 1, <16 x i64> %ptrs, <16 x i32> %values, <16 x i1> %vecmask)
+  ret void
+}
+
+;; scatter - i64
+scatterbo32_64(i64)
 gen_scatter(i64)
-gen_scatter(float)
+
+;; scatter - float
+declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
+define void
+@__scatter_base_offsets32_float(i8* %ptr, i32 %offset_scale, <16 x i32> %offsets, <16 x float> %vals, <16 x i1> %vecmask) nounwind {
+  %mask = bitcast <16 x i1> %vecmask to i16
+  call void @llvm.x86.avx512.scatter.dps.512 (i8* %ptr, i16 %mask, <16 x i32> %offsets, <16 x float> %vals, i32 %offset_scale)
+  ret void
+}
+
+declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
+define void
+@__scatter_base_offsets64_float(i8* %ptr, i32 %offset_scale, <16 x i64> %offsets, <16 x float> %vals, <16 x i1> %vecmask) nounwind {
+  %mask = bitcast <16 x i1> %vecmask to i16
+  %mask_shifted = lshr i16 %mask, 8
+  %mask_lo = trunc i16 %mask to i8
+  %mask_hi = trunc i16 %mask_shifted to i8
+  %offsets_lo = shufflevector <16 x i64> %offsets, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %offsets_hi = shufflevector <16 x i64> %offsets, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %res_lo = shufflevector <16 x float> %vals, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %res_hi = shufflevector <16 x float> %vals, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  call void @llvm.x86.avx512.scatter.qps.512 (i8* %ptr, i8 %mask_lo, <8 x i64> %offsets_lo, <8 x float> %res_lo, i32 %offset_scale)
+  call void @llvm.x86.avx512.scatter.qps.512 (i8* %ptr, i8 %mask_hi, <8 x i64> %offsets_hi, <8 x float> %res_hi, i32 %offset_scale)
+  ret void
+}
+
+define void
+@__scatter32_float(<16 x i32> %ptrs, <16 x float> %values, <16 x i1> %vecmask) nounwind alwaysinline {
+  call void @__scatter_base_offsets32_float(i8 * zeroinitializer, i32 1, <16 x i32> %ptrs, <16 x float> %values, <16 x i1> %vecmask)
+  ret void
+}
+
+define void
+@__scatter64_float(<16 x i64> %ptrs, <16 x float> %values, <16 x i1> %vecmask) nounwind alwaysinline {
+  call void @__scatter_base_offsets64_float(i8 * zeroinitializer, i32 1, <16 x i64> %ptrs, <16 x float> %values, <16 x i1> %vecmask)
+  ret void
+}
+
+;; scatter - double
+scatterbo32_64(double)
 gen_scatter(double)
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
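
Note on the lowering (reviewer sketch, not part of the patch): the dpi/dps scatter intrinsics take all 16 lanes at once, while the qpi/qps forms only accept 8 qword indices, which is why the 64-bit-offset paths above split the mask with lshr/trunc and the offsets/values with shufflevector before issuing two calls. The C fragment below expresses the same two cases for i32 data with AVX-512F intrinsics; the function names are illustrative only, and the scale is fixed at 1 because the intrinsics require an immediate scale operand, whereas the builtins take %offset_scale at run time.

/*
 * Reviewer sketch, not part of the patch: equivalent lowering with AVX-512F
 * intrinsics.  scatter32_i32() corresponds to __scatter_base_offsets32_i32
 * (one vpscatterdd); scatter64_i32() corresponds to
 * __scatter_base_offsets64_i32 (two vpscatterqd halves).
 */
#include <immintrin.h>
#include <stdint.h>

/* 16 x i32 values, 32-bit offsets: one masked vpscatterdd covers all lanes. */
static inline void scatter32_i32(uint8_t *base, __mmask16 mask,
                                 __m512i offsets, __m512i vals) {
    _mm512_mask_i32scatter_epi32(base, mask, offsets, vals, 1);
}

/* 16 x i32 values, 64-bit offsets: vpscatterqd only takes 8 qword indices,
 * so split the mask, offsets, and values into low/high halves, mirroring the
 * lshr/trunc and shufflevector sequence in the LLVM IR above. */
static inline void scatter64_i32(uint8_t *base, __mmask16 mask,
                                 __m512i offsets_lo, __m512i offsets_hi,
                                 __m512i vals) {
    __mmask8 mask_lo = (__mmask8)mask;          /* lanes 0..7  */
    __mmask8 mask_hi = (__mmask8)(mask >> 8);   /* lanes 8..15 */
    __m256i  vals_lo = _mm512_castsi512_si256(vals);        /* lanes 0..7  */
    __m256i  vals_hi = _mm512_extracti64x4_epi64(vals, 1);  /* lanes 8..15 */
    _mm512_mask_i64scatter_epi32(base, mask_lo, offsets_lo, vals_lo, 1);
    _mm512_mask_i64scatter_epi32(base, mask_hi, offsets_hi, vals_hi, 1);
}

The float variants differ only in the data type (vscatterdps/vscatterqps, __m512/__m256 values); in both cases lanes whose mask bit is clear are not written to memory, which is what the <16 x i1> %vecmask guarantees at the ISPC level.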