From ba10b916483664dfaf9891010cee9e0dedf4931a Mon Sep 17 00:00:00 2001
From: Vsevolod Livinskiy
Date: Thu, 9 Jul 2015 15:36:03 +0300
Subject: [PATCH] [AVX-512]: masked_load and masked_store were replaced with
 native intrinsics

---
 builtins/target-avx512-common.ll | 95 ++++++++++++++++++++++++++------
 1 file changed, 79 insertions(+), 16 deletions(-)

diff --git a/builtins/target-avx512-common.ll b/builtins/target-avx512-common.ll
index 9d6c60d2..194569ab 100644
--- a/builtins/target-avx512-common.ll
+++ b/builtins/target-avx512-common.ll
@@ -811,12 +811,34 @@ define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
 
 masked_load(i8, 1)
 masked_load(i16, 2)
-masked_load(i32, 4)
-masked_load(i64, 8)
+
+declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16)
+define <16 x i32> @__masked_load_i32(i8 * %ptr, <16 x i1> %mask) nounwind alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask_i16)
+  ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8)
+define <16 x i64> @__masked_load_i64(i8 * %ptr, <16 x i1> %mask) nounwind alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %mask_lo_i8 = trunc i16 %mask_i16 to i8
+  %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef,
+                           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8
+  %r_lo = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask_lo_i8)
+  %ptr_hi = getelementptr i8, i8* %ptr, i32 64
+  %r_hi = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr_hi, <8 x i64> zeroinitializer, i8 %mask_hi_i8)
+  %res = shufflevector <8 x i64> %r_lo, <8 x i64> %r_hi,
+                       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i64> %res
+}
 
 declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
-
 define <16 x float> @__masked_load_float(i8 * %ptr, <16 x i1> %mask) readonly alwaysinline {
   %mask_i16 = bitcast <16 x i1> %mask to i16
   %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask_i16)
@@ -825,7 +847,6 @@ define <16 x float> @__masked_load_float(i8 * %ptr, <16 x i1> %mask) readonly alwaysinline {
 
 declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)
-
 define <16 x double> @__masked_load_double(i8 * %ptr, <16 x i1> %mask) readonly alwaysinline {
   %mask_i16 = bitcast <16 x i1> %mask to i16
   %mask_lo_i8 = trunc i16 %mask_i16 to i8
@@ -846,22 +867,64 @@ define <16 x double> @__masked_load_double(i8 * %ptr, <16 x i1> %mask) readonly alwaysinline {
 
 gen_masked_store(i8) ; llvm.x86.sse2.storeu.dq
 gen_masked_store(i16)
-gen_masked_store(i32)
-gen_masked_store(i64)
-define void @__masked_store_float(<16 x float> * nocapture, <16 x float>,
-                                  <16 x i1>) nounwind alwaysinline {
-  %ptr = bitcast <16 x float> * %0 to <16 x i32> *
-  %val = bitcast <16 x float> %1 to <16 x i32>
-  call void @__masked_store_i32(<16 x i32> * %ptr, <16 x i32> %val, <16 x i1> %2)
+declare void @llvm.x86.avx512.mask.storeu.d.512(i8*, <16 x i32>, i16)
+define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32> %v, <16 x i1> %mask) nounwind alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %ptr_i8 = bitcast <16 x i32>* %0 to i8*
+  call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr_i8, <16 x i32> %v, i16 %mask_i16)
   ret void
 }
 
-define void @__masked_store_double(<16 x double> * nocapture, <16 x double>,
-                                   <16 x i1>) nounwind alwaysinline {
-  %ptr = bitcast <16 x double> * %0 to <16 x i64> *
-  %val = bitcast <16 x double> %1 to <16 x i64>
-  call void @__masked_store_i64(<16 x i64> * %ptr, <16 x i64> %val, <16 x i1> %2)
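+;; A 16-wide 64-bit vector does not fit in a single 512-bit register, so
+;; the i64/double variants below work in two 8-wide halves: the low half
+;; of the mask is produced by truncating the 16-bit mask value, the high
+;; half by a shufflevector, and the two halves are loaded/stored 64 bytes
+;; (8 x 64 bits) apart.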
+declare void @llvm.x86.avx512.mask.storeu.q.512(i8*, <8 x i64>, i8)
+define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64> %v, <16 x i1> %mask) nounwind alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %mask_lo_i8 = trunc i16 %mask_i16 to i8
+  %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef,
+                           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8
+
+  %ptr_lo_i8 = bitcast <16 x i64>* %0 to i8*
+  %ptr_hi_i8 = getelementptr i8, i8* %ptr_lo_i8, i32 64
+
+  %v_lo = shufflevector <16 x i64> %v, <16 x i64> undef,
+                        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v_hi = shufflevector <16 x i64> %v, <16 x i64> undef,
+                        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr_lo_i8, <8 x i64> %v_lo, i8 %mask_lo_i8)
+  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr_hi_i8, <8 x i64> %v_hi, i8 %mask_hi_i8)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16)
+define void @__masked_store_float(<16 x float>* nocapture, <16 x float> %v, <16 x i1> %mask) nounwind alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %ptr_i8 = bitcast <16 x float>* %0 to i8*
+  call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr_i8, <16 x float> %v, i16 %mask_i16)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
+define void @__masked_store_double(<16 x double>* nocapture, <16 x double> %v, <16 x i1> %mask) nounwind alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %mask_lo_i8 = trunc i16 %mask_i16 to i8
+  %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef,
+                           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8
+
+  %ptr_lo_i8 = bitcast <16 x double>* %0 to i8*
+  %ptr_hi_i8 = getelementptr i8, i8* %ptr_lo_i8, i32 64
+
+  %v_lo = shufflevector <16 x double> %v, <16 x double> undef,
+                        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v_hi = shufflevector <16 x double> %v, <16 x double> undef,
+                        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr_lo_i8, <8 x double> %v_lo, i8 %mask_lo_i8)
+  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr_hi_i8, <8 x double> %v_hi, i8 %mask_hi_i8)
   ret void
 }
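
Note (reviewer aid, not part of the patch): a minimal caller sketch of the new
entry points; the function name @store_odd_lanes and its constant mask are
illustrative only.

; Store only the odd lanes of a <16 x double> value; the even lanes of
; memory are left untouched by the masked store.
define void @store_odd_lanes(<16 x double>* %p, <16 x double> %v) {
  call void @__masked_store_double(<16 x double>* %p, <16 x double> %v,
      <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1,
                 i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>)
  ret void
}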