diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll
index 465a6df3..f1adec58 100644
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -417,18 +417,18 @@ define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
 masked_load(i8, 1)
 masked_load(i16, 2)
-declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
-declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x MfORi32> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x MdORi64> %mask)
 define <16 x i32> @__masked_load_i32(i8 *, <16 x i32> %mask) nounwind alwaysinline {
-  %floatmask = bitcast <16 x i32> %mask to <16 x float>
-  %mask0 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+  %floatmask = bitcast <16 x i32> %mask to <16 x MfORi32>
+  %mask0 = shufflevector <16 x MfORi32> %floatmask, <16 x MfORi32> undef,
     <8 x i32>
-  %val0 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %mask0)
-  %mask1 = shufflevector <16 x float> %floatmask, <16 x float> undef,
+  %val0 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x MfORi32> %mask0)
+  %mask1 = shufflevector <16 x MfORi32> %floatmask, <16 x MfORi32> undef,
     <8 x i32>
   %ptr1 = getelementptr PTR_OP_ARGS(`i8') %0, i32 32  ;; 8x4 bytes = 32
-  %val1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %ptr1, <8 x float> %mask1)
+  %val1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %ptr1, <8 x MfORi32> %mask1)
   %retval = shufflevector <8 x float> %val0, <8 x float> %val1,
     <16 x i32>
@__masked_load_i64(i8 *, <16 x i32> %mask) nounwind alwaysinli
     <8 x i32>
   %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32>
-  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
-  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
-  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
-  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x MdORi64>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x MdORi64>
+  %mask2d = bitcast <8 x i32> %mask2 to <4 x MdORi64>
+  %mask3d = bitcast <8 x i32> %mask3 to <4 x MdORi64>
-  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
+  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x MdORi64> %mask0d)
   %ptr1 = getelementptr PTR_OP_ARGS(`i8') %0, i32 32
-  %val1d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr1, <4 x double> %mask1d)
+  %val1d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr1, <4 x MdORi64> %mask1d)
   %ptr2 = getelementptr PTR_OP_ARGS(`i8') %0, i32 64
-  %val2d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr2, <4 x double> %mask2d)
+  %val2d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr2, <4 x MdORi64> %mask2d)
   %ptr3 = getelementptr PTR_OP_ARGS(`i8') %0, i32 96
-  %val3d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr3, <4 x double> %mask3d)
+  %val3d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr3, <4 x MdORi64> %mask3d)
   %val01 = shufflevector <4 x double> %val0d, <4 x double> %val1d,
     <8 x i32>
@@ -484,28 +484,28 @@ gen_masked_store(i8)
 gen_masked_store(i16)
 ; note that mask is the 2nd parameter, not the 3rd one!!
-declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
-declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
+declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x MfORi32>, <8 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x MdORi64>, <4 x double>)
 define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32>, <16 x i32>) nounwind alwaysinline {
   %ptr = bitcast <16 x i32> * %0 to i8 *
   %val = bitcast <16 x i32> %1 to <16 x float>
-  %mask = bitcast <16 x i32> %2 to <16 x float>
+  %mask = bitcast <16 x i32> %2 to <16 x MfORi32>
   %val0 = shufflevector <16 x float> %val, <16 x float> undef,
     <8 x i32>
   %val1 = shufflevector <16 x float> %val, <16 x float> undef,
     <8 x i32>
-  %mask0 = shufflevector <16 x float> %mask, <16 x float> undef,
+  %mask0 = shufflevector <16 x MfORi32> %mask, <16 x MfORi32> undef,
     <8 x i32>
-  %mask1 = shufflevector <16 x float> %mask, <16 x float> undef,
+  %mask1 = shufflevector <16 x MfORi32> %mask, <16 x MfORi32> undef,
     <8 x i32>
-  call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x float> %mask0, <8 x float> %val0)
+  call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x MfORi32> %mask0, <8 x float> %val0)
   %ptr1 = getelementptr PTR_OP_ARGS(`i8') %ptr, i32 32
-  call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr1, <8 x float> %mask1, <8 x float> %val1)
+  call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr1, <8 x MfORi32> %mask1, <8 x float> %val1)
   ret void
 }
@@ -524,10 +524,10 @@ define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
     <8 x i32>
   %mask3 = shufflevector <16 x i32> %mask, <16 x i32> undef,
     <8 x i32>
-  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
-  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
-  %mask2d = bitcast <8 x i32> %mask2 to <4 x double>
-  %mask3d = bitcast <8 x i32> %mask3 to <4 x double>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x MdORi64>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x MdORi64>
+  %mask2d = bitcast <8 x i32> %mask2 to <4 x MdORi64>
+  %mask3d = bitcast <8 x i32> %mask3 to <4 x MdORi64>
   %val0 = shufflevector <16 x double> %val, <16 x double> undef,
     <4 x i32>
@@ -538,13 +538,13 @@ define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64>,
   %val3 = shufflevector <16 x double> %val, <16 x double> undef,
     <4 x i32>
-  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x MdORi64> %mask0d, <4 x double> %val0)
   %ptr1 = getelementptr PTR_OP_ARGS(`i8') %ptr, i32 32
-  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x double> %mask1d, <4 x double> %val1)
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x MdORi64> %mask1d, <4 x double> %val1)
   %ptr2 = getelementptr PTR_OP_ARGS(`i8') %ptr, i32 64
-  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr2, <4 x double> %mask2d, <4 x double> %val2)
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr2, <4 x MdORi64> %mask2d, <4 x double> %val2)
   %ptr3 = getelementptr PTR_OP_ARGS(`i8') %ptr, i32 96
-  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr3, <4 x double> %mask3d, <4 x double> %val3)
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr3, <4 x MdORi64> %mask3d, <4 x double> %val3)
   ret void
 }
diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll
index 8cf2a9de..683aed5b 100644
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -399,12 +399,12 @@ reduce_equal(8)
 masked_load(i8, 1)
 masked_load(i16, 2)
-declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
-declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x MfORi32> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x MdORi64> %mask)
 define <8 x i32> @__masked_load_i32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
-  %floatmask = bitcast <8 x i32> %mask to <8 x float>
-  %floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x float> %floatmask)
+  %floatmask = bitcast <8 x i32> %mask to <8 x MfORi32>
+  %floatval = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8 * %0, <8 x MfORi32> %floatmask)
   %retval = bitcast <8 x float> %floatval to <8 x i32>
   ret <8 x i32> %retval
 }
@@ -416,12 +416,12 @@ define <8 x i64> @__masked_load_i64(i8 *, <8 x i32> %mask) nounwind alwaysinline
     <8 x i32>
   %mask1 = shufflevector <8 x i32> %mask, <8 x i32> undef,
     <8 x i32>
-  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
-  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x MdORi64>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x MdORi64>
-  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
+  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x MdORi64> %mask0d)
   %ptr1 = getelementptr PTR_OP_ARGS(`i8') %0, i32 32
-  %val1d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr1, <4 x double> %mask1d)
+  %val1d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %ptr1, <4 x MdORi64> %mask1d)
   %vald = shufflevector <4 x double> %val0d, <4 x double> %val1d,
     <8 x i32>
@@ -438,15 +438,15 @@ gen_masked_store(i8)
 gen_masked_store(i16)
 ; note that mask is the 2nd parameter, not the 3rd one!!
-declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
-declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
+declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x MfORi32>, <8 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x MdORi64>, <4 x double>)
 define void @__masked_store_i32(<8 x i32>* nocapture, <8 x i32>, <8 x i32>) nounwind alwaysinline {
   %ptr = bitcast <8 x i32> * %0 to i8 *
   %val = bitcast <8 x i32> %1 to <8 x float>
-  %mask = bitcast <8 x i32> %2 to <8 x float>
-  call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x float> %mask, <8 x float> %val)
+  %mask = bitcast <8 x i32> %2 to <8 x MfORi32>
+  call void @llvm.x86.avx.maskstore.ps.256(i8 * %ptr, <8 x MfORi32> %mask, <8 x float> %val)
   ret void
 }
@@ -460,17 +460,17 @@ define void @__masked_store_i64(<8 x i64>* nocapture, <8 x i64>,
   %mask1 = shufflevector <8 x i32> %mask, <8 x i32> undef,
     <8 x i32>
-  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
-  %mask1d = bitcast <8 x i32> %mask1 to <4 x double>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x MdORi64>
+  %mask1d = bitcast <8 x i32> %mask1 to <4 x MdORi64>
   %val0 = shufflevector <8 x double> %val, <8 x double> undef,
     <4 x i32>
   %val1 = shufflevector <8 x double> %val, <8 x double> undef,
     <4 x i32>
-  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x MdORi64> %mask0d, <4 x double> %val0)
   %ptr1 = getelementptr PTR_OP_ARGS(`i8') %ptr, i32 32
-  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x double> %mask1d, <4 x double> %val1)
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr1, <4 x MdORi64> %mask1d, <4 x double> %val1)
   ret void
 }
diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll
index 62298505..e4a76cb3 100644
--- a/builtins/target-avx1-i64x4base.ll
+++ b/builtins/target-avx1-i64x4base.ll
@@ -390,21 +390,21 @@ masked_load(i8, 1)
 masked_load(i16, 2)
 ;; avx intrinsics
-declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask)
-declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x MfORi32> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x MdORi64> %mask)
 define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline {
   %mask = trunc <4 x i64> %mask64 to <4 x i32>
-  %floatmask = bitcast <4 x i32> %mask to <4 x float>
-  %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask)
+  %floatmask = bitcast <4 x i32> %mask to <4 x MfORi32>
+  %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x MfORi32> %floatmask)
   %retval = bitcast <4 x float> %floatval to <4 x i32>
   ret <4 x i32> %retval
 }
 define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline {
-  %doublemask = bitcast <4 x i64> %mask to <4 x double>
-  %doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %doublemask)
+  %doublemask = bitcast <4 x i64> %mask to <4 x MdORi64>
+  %doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x MdORi64> %doublemask)
   %retval = bitcast <4 x double> %doubleval to <4 x i64>
   ret <4 x i64> %retval
 }
@@ -419,8 +419,8 @@ gen_masked_store(i16)
 ; note that mask is the 2nd parameter, not the 3rd one!!
 ;; avx intrinsics
-declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>)
-declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
+declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x MfORi32>, <4 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x MdORi64>, <4 x double>)
 define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
                                 <4 x i64>) nounwind alwaysinline {
@@ -428,8 +428,8 @@ define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
   %ptr = bitcast <4 x i32> * %0 to i8 *
   %val = bitcast <4 x i32> %1 to <4 x float>
-  %mask = bitcast <4 x i32> %mask32 to <4 x float>
-  call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val)
+  %mask = bitcast <4 x i32> %mask32 to <4 x MfORi32>
+  call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x MfORi32> %mask, <4 x float> %val)
   ret void
 }
@@ -437,8 +437,8 @@ define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
                                 <4 x i64>) nounwind alwaysinline {
   %ptr = bitcast <4 x i64> * %0 to i8 *
   %val = bitcast <4 x i64> %1 to <4 x double>
-  %mask = bitcast <4 x i64> %2 to <4 x double>
-  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask, <4 x double> %val)
+  %mask = bitcast <4 x i64> %2 to <4 x MdORi64>
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x MdORi64> %mask, <4 x double> %val)
   ret void
 }
diff --git a/builtins/util.m4 b/builtins/util.m4
index 7c952ceb..dcd8addc 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -60,6 +60,23 @@ define(`PTR_OP_ARGS',
   )
 )
+;; x86 mask load/stores have different mask type since 3.8
+
+define(`MdORi64',
+  ifelse(LLVM_VERSION, LLVM_3_8,
+    ``i64'',
+    ``double''
+  )
+)
+
+define(`MfORi32',
+  ifelse(LLVM_VERSION, LLVM_3_8,
+    ``i32'',
+    ``float''
+  )
+)
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector convertation utilities
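
Note (reviewer sketch, not part of the patch): the two m4 macros added to util.m4 pick the mask element type that the AVX maskload/maskstore intrinsics expect for a given LLVM version. Assuming LLVM_VERSION and LLVM_3_8 are defined the same way as in the existing version checks in util.m4, the maskload declarations above would expand roughly as follows after m4 processing:

    ;; LLVM_VERSION == LLVM_3_8: MfORi32 -> i32, MdORi64 -> i64
    declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x i32> %mask)
    declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x i64> %mask)

    ;; older LLVM versions: MfORi32 -> float, MdORi64 -> double (previous behavior)
    declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
    declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)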