diff --git a/builtins/target-avx512-common.ll b/builtins/target-avx512-common.ll
index 194569ab..0ff1db0f 100644
--- a/builtins/target-avx512-common.ll
+++ b/builtins/target-avx512-common.ll
@@ -826,11 +826,14 @@ define <16 x i64> @__masked_load_i64(i8 * %ptr, <16 x i1> %mask) nounwind always
   %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8
 
-  %r0 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask_hi_i8)
+
   %ptr_d = bitcast i8* %ptr to <16 x i64>*
-  %ptr_lo = getelementptr <16 x i64>, <16 x i64>* %ptr_d, i32 8
-  %ptr_lo_i8 = bitcast <16 x i64>* %ptr_lo to i8*
-  %r1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr_lo_i8, <8 x i64> zeroinitializer, i8 %mask_lo_i8)
+  %ptr_hi = getelementptr <16 x i64>, <16 x i64>* %ptr_d, i32 0, i32 8
+  %ptr_hi_i8 = bitcast i64* %ptr_hi to i8*
+
+  %r0 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask_lo_i8)
+  %r1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr_hi_i8, <8 x i64> zeroinitializer, i8 %mask_hi_i8)
+
   %res = shufflevector <8 x i64> %r0, <8 x i64> %r1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -845,7 +848,6 @@ define <16 x float> @__masked_load_float(i8 * %ptr, <16 x i1> %mask) readonly al
   ret <16 x float> %res
 }
-
 declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)
 
 define <16 x double> @__masked_load_double(i8 * %ptr, <16 x i1> %mask) readonly alwaysinline {
   %mask_i16 = bitcast <16 x i1> %mask to i16
@@ -853,11 +855,14 @@ define <16 x double> @__masked_load_double(i8 * %ptr, <16 x i1> %mask) readonly
   %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8
 
-  %r0 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask_hi_i8)
+
   %ptr_d = bitcast i8* %ptr to <16 x double>*
-  %ptr_lo = getelementptr <16 x double>, <16 x double>* %ptr_d, i32 8
-  %ptr_lo_i8 = bitcast <16 x double>* %ptr_lo to i8*
-  %r1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr_lo_i8, <8 x double> zeroinitializer, i8 %mask_lo_i8)
+  %ptr_hi = getelementptr <16 x double>, <16 x double>* %ptr_d, i32 0, i32 8
+  %ptr_hi_i8 = bitcast double* %ptr_hi to i8*
+
+  %r0 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask_lo_i8)
+  %r1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr_hi_i8, <8 x double> zeroinitializer, i8 %mask_hi_i8)
+
   %res = shufflevector <8 x double> %r0, <8 x double> %r1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -885,16 +890,16 @@ define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64> %v, <16 x i1>
   %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8
 
   %ptr_i8 = bitcast <16 x i64>* %0 to i8*
-  %ptr_lo = getelementptr <16 x i64>, <16 x i64>* %0, i32 8
-  %ptr_lo_i8 = bitcast <16 x i64>* %ptr_lo to i8*
+  %ptr_lo = getelementptr <16 x i64>, <16 x i64>* %0, i32 0, i32 8
+  %ptr_lo_i8 = bitcast i64* %ptr_lo to i8*
 
   %v_lo = shufflevector <16 x i64> %v, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %v_hi = shufflevector <16 x i64> %v, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 
-  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr_i8, <8 x i64> %v_hi, i8 %mask_hi_i8)
-  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr_lo_i8, <8 x i64> %v_lo, i8 %mask_lo_i8)
+  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr_i8, <8 x i64> %v_lo, i8 %mask_lo_i8)
+  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr_lo_i8, <8 x i64> %v_hi, i8 %mask_hi_i8)
 
   ret void
 }
@@ -915,16 +920,16 @@ define void @__masked_store_double(<16 x double>* nocapture, <16 x double> %v, <
   %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8
 
   %ptr_i8 = bitcast <16 x double>* %0 to i8*
-  %ptr_lo = getelementptr <16 x double>, <16 x double>* %0, i32 8
-  %ptr_lo_i8 = bitcast <16 x double>* %ptr_lo to i8*
+  %ptr_lo = getelementptr <16 x double>, <16 x double>* %0, i32 0, i32 8
+  %ptr_lo_i8 = bitcast double* %ptr_lo to i8*
 
   %v_lo = shufflevector <16 x double> %v, <16 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %v_hi = shufflevector <16 x double> %v, <16 x double> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 
-  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr_i8, <8 x double> %v_hi, i8 %mask_hi_i8)
-  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr_lo_i8, <8 x double> %v_lo, i8 %mask_lo_i8)
+  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr_i8, <8 x double> %v_lo, i8 %mask_lo_i8)
+  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr_lo_i8, <8 x double> %v_hi, i8 %mask_hi_i8)
 
   ret void
 }
@@ -946,33 +951,25 @@ define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
 
 define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>, <WIDTH x MASK>) nounwind alwaysinline {
-  %v = load PTR_OP_ARGS(`<WIDTH x i32> ') %0
-  %v1 = select <WIDTH x MASK> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
-  store <WIDTH x i32> %v1, <WIDTH x i32> * %0
+  call void @__masked_store_i32(<16 x i32>* %0, <16 x i32> %1, <16 x i1> %2)
   ret void
 }
 
 define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>, <WIDTH x MASK>) nounwind alwaysinline {
-  %v = load PTR_OP_ARGS(`<WIDTH x float> ') %0
-  %v1 = select <WIDTH x MASK> %2, <WIDTH x float> %1, <WIDTH x float> %v
-  store <WIDTH x float> %v1, <WIDTH x float> * %0
+  call void @__masked_store_float(<16 x float>* %0, <16 x float> %1, <16 x i1> %2)
   ret void
 }
 
 define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture, <WIDTH x i64>, <WIDTH x MASK>) nounwind alwaysinline {
-  %v = load PTR_OP_ARGS(`<WIDTH x i64> ') %0
-  %v1 = select <WIDTH x MASK> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
-  store <WIDTH x i64> %v1, <WIDTH x i64> * %0
+  call void @__masked_store_i64(<16 x i64>* %0, <16 x i64> %1, <16 x i1> %2)
   ret void
 }
 
 define void @__masked_store_blend_double(<WIDTH x double>* nocapture, <WIDTH x double>, <WIDTH x MASK>) nounwind alwaysinline {
-  %v = load PTR_OP_ARGS(`<WIDTH x double> ') %0
-  %v1 = select <WIDTH x MASK> %2, <WIDTH x double> %1, <WIDTH x double> %v
-  store <WIDTH x double> %v1, <WIDTH x double> * %0
+  call void @__masked_store_double(<16 x double>* %0, <16 x double> %1, <16 x i1> %2)
   ret void
 }
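
Note for reviewers: the pre-patch code paired the low-half load/store with the high half of the mask (and vice versa), and its `getelementptr <16 x i64>, <16 x i64>* %ptr_d, i32 8` advanced by eight whole 16-element vectors (1024 bytes) rather than eight elements (64 bytes); the corrected `i32 0, i32 8` indices address element 8 of the same vector. Below is a minimal C sketch of the corrected lo/hi split for the i64 case, using a hypothetical helper name that is not part of this patch:

#include <immintrin.h>
#include <stdint.h>

/* Hypothetical illustration only, not part of the patch: a 16-wide masked
   i64 load built from two 8-wide AVX-512 masked loads, mirroring the fixed
   IR. The low half uses mask bits 0..7 at element offset 0; the high half
   uses mask bits 8..15 at an offset of 8 elements (64 bytes), not 8 vectors. */
static inline void masked_load_i64_16(const int64_t *ptr, uint16_t mask,
                                      __m512i out[2]) {
    out[0] = _mm512_maskz_loadu_epi64((__mmask8)(mask & 0xFF), ptr);      /* lanes 0..7  */
    out[1] = _mm512_maskz_loadu_epi64((__mmask8)(mask >> 8),   ptr + 8);  /* lanes 8..15 */
}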