diff --git a/builtins/target-avx512-common.ll b/builtins/target-avx512-common.ll
index 1b310a2f..9d6c60d2 100644
--- a/builtins/target-avx512-common.ll
+++ b/builtins/target-avx512-common.ll
@@ -407,13 +407,13 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16
 
 define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
   %ret = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %0, <16 x i32> %1,
-                                                           <16 x i32> zeroinitializer, i16 -1)
+                                                           <16 x i32> zeroinitializer, i16 -1)
   ret <16 x i32> %ret
 }
 
 define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
   %ret = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %0, <16 x i32> %1,
-                                                           <16 x i32> zeroinitializer, i16 -1)
+                                                           <16 x i32> zeroinitializer, i16 -1)
   ret <16 x i32> %ret
 }
 
@@ -814,9 +814,50 @@ masked_load(i16, 2)
 masked_load(i32, 4)
 masked_load(i64, 8)
 
-masked_load_float_double()
 
-gen_masked_store(i8)
+;; Masked load of 16 floats. The <16 x i1> mask is bitcast to the i16 mask
+;; operand of the intrinsic; zeroinitializer is the pass-through vector.
+declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
+
+define <16 x float> @__masked_load_float(i8 * %ptr, <16 x i1> %mask) readonly alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask_i16)
+  ret <16 x float> %res
+}
+
+
+;; Masked load of 16 doubles, done as two 8-wide masked loads because the
+;; pd.512 intrinsic handles <8 x double> with an i8 mask.
+declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)
+
+define <16 x double> @__masked_load_double(i8 * %ptr, <16 x i1> %mask) readonly alwaysinline {
+  ; Split the mask: bits 0..7 cover lanes 0..7, bits 8..15 cover lanes 8..15.
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %mask_lo_i8 = trunc i16 %mask_i16 to i8
+  %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef,
+    <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8
+
+  ; Lanes 0..7 start at %ptr and must be gated by the low mask half.
+  %r0 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask_lo_i8)
+
+  ; Lanes 8..15 start at element 8 of the same vector (64 bytes past %ptr).
+  ; The leading "i32 0" keeps the GEP inside one <16 x double>; without it
+  ; the offset would step over 8 whole vectors.
+  %ptr_d = bitcast i8* %ptr to <16 x double>*
+  %ptr_hi = getelementptr <16 x double>, <16 x double>* %ptr_d, i32 0, i32 8
+  %ptr_hi_i8 = bitcast double* %ptr_hi to i8*
+  %r1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr_hi_i8, <8 x double> zeroinitializer, i8 %mask_hi_i8)
+
+  ; Concatenate the two halves back into lane order 0..15.
+  %res = shufflevector <8 x double> %r0, <8 x double> %r1,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x double> %res
+}
+
+
+gen_masked_store(i8) ; llvm.x86.sse2.storeu.dq
 gen_masked_store(i16)
 gen_masked_store(i32)
 gen_masked_store(i64)