diff --git a/builtins/target-avx512-common.ll b/builtins/target-avx512-common.ll index 2fff6827..28cd0287 100644 --- a/builtins/target-avx512-common.ll +++ b/builtins/target-avx512-common.ll @@ -54,7 +54,6 @@ aossoa() ;; half conversion routines declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone -; 0 is round nearest even declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone { @@ -204,42 +203,146 @@ define double @__ceil_uniform_double(double) nounwind readonly alwaysinline { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats -declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone +declare <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p) +declare <16 x float> @llvm.floor.v16f32(<16 x float> %p) +declare <16 x float> @llvm.ceil.v16f32(<16 x float> %p) + define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { - ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 - round8to16(%0, 8) + %res = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %0) + ret <16 x float> %res } define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { - ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 - round8to16(%0, 9) + %res = call <16 x float> @llvm.floor.v16f32(<16 x float> %0) + ret <16 x float> %res } define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { - ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 - round8to16(%0, 10) + %res = call <16 x float> @llvm.ceil.v16f32(<16 x float> %0) + ret <16 x float> %res } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding doubles -declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone +declare <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p) +declare <8 x double> @llvm.floor.v8f64(<8 x double> %p) +declare <8 x double> @llvm.ceil.v8f64(<8 x double> %p) + define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline { - round4to16double(%0, 8) + %v0 = shufflevector <16 x double> %0, <16 x double> undef, <8 x i32> + %v1 = shufflevector <16 x double> %0, <16 x double> undef, <8 x i32> + %r0 = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %v0) + %r1 = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %v1) + %res = shufflevector <8 x double> %r0, <8 x double> %r1, <16 x i32> + ret <16 x double> %res } define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline { - round4to16double(%0, 9) + %v0 = shufflevector <16 x double> %0, <16 x double> undef, <8 x i32> + %v1 = shufflevector <16 x double> %0, <16 x double> undef, <8 x i32> + %r0 = call <8 x double> @llvm.floor.v8f64(<8 x double> %v0) + %r1 = call <8 x double> @llvm.floor.v8f64(<8 x double> %v1) + %res = shufflevector <8 x double> %r0, <8 x double> %r1, <16 x i32> + ret <16 x double> %res } define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline { - round4to16double(%0, 10) + %v0 = shufflevector <16 x double> %0, <16 x double> undef, <8 x i32> + %v1 = shufflevector <16 x double> %0, <16 x double> undef, <8 x i32> + %r0 = call <8 x double> @llvm.ceil.v8f64(<8 x double> %v0) + %r1 = call <8 x double> @llvm.ceil.v8f64(<8 x double> %v1) + %res = shufflevector <8 x double> %r0, <8 x double> %r1, <16 x i32> + ret <16 x double> %res } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; min/max -int64minmax() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int64/uint64 min/max +define i64 @__max_uniform_int64(i64, i64) nounwind readonly alwaysinline { + %c = icmp sgt i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_uint64(i64, i64) nounwind readonly alwaysinline { + %c = icmp ugt i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__min_uniform_int64(i64, i64) nounwind readonly alwaysinline { + %c = icmp slt i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__min_uniform_uint64(i64, i64) nounwind readonly alwaysinline { + %c = icmp ult i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} + +declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <16 x i64> @__max_varying_int64(<16 x i64>, <16 x i64>) nounwind readonly alwaysinline { + %v0_lo = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> + %v0_hi = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> + %v1_lo = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> + %v1_hi = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> + %r0 = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %v0_lo, <8 x i64> %v1_lo, <8 x i64>zeroinitializer, i8 -1) + %r1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %v0_hi, <8 x i64> %v1_hi, <8 x i64>zeroinitializer, i8 -1) + %res = shufflevector <8 x i64> %r0, <8 x i64> %r1, <16 x i32> + ret <16 x i64> %res +} + +define <16 x i64> @__max_varying_uint64(<16 x i64>, <16 x i64>) nounwind readonly alwaysinline { + %v0_lo = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> + %v0_hi = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> + %v1_lo = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> + %v1_hi = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> + + %r0 = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %v0_lo, <8 x i64> %v1_lo, <8 x i64>zeroinitializer, i8 -1) + %r1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %v0_hi, <8 x i64> %v1_hi, <8 x i64>zeroinitializer, i8 -1) + %res = shufflevector <8 x i64> %r0, <8 x i64> %r1, <16 x i32> + ret <16 x i64> %res +} + +define <16 x i64> @__min_varying_int64(<16 x i64>, <16 x i64>) nounwind readonly alwaysinline { + %v0_lo = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> + %v0_hi = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> + %v1_lo = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> + %v1_hi = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> + + %r0 = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %v0_lo, <8 x i64> %v1_lo, <8 x i64>zeroinitializer, i8 -1) + %r1 = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %v0_hi, <8 x i64> %v1_hi, <8 x i64>zeroinitializer, i8 -1) + %res = shufflevector <8 x i64> %r0, <8 x i64> %r1, <16 x i32> + ret <16 x i64> %res +} + +define <16 x i64> @__min_varying_uint64(<16 x i64>, <16 x i64>) nounwind readonly alwaysinline { + %v0_lo = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> + %v0_hi = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> + %v1_lo = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> + %v1_hi = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> + + %r0 = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %v0_lo, <8 x i64> %v1_lo, <8 x i64>zeroinitializer, i8 -1) + %r1 = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %v0_hi, <8 x i64> %v1_hi, <8 x i64>zeroinitializer, i8 -1) + %res = shufflevector <8 x i64> %r0, <8 x i64> %r1, <16 x i32> + ret <16 x i64> %res +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -256,19 +359,17 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline { ret float %ret } -declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone -declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone +declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) -define <16 x float> @__max_varying_float(<16 x float>, - <16 x float>) nounwind readonly alwaysinline { - binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1) - ret <16 x float> %call +define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %0, <16 x float> %1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res } -define <16 x float> @__min_varying_float(<16 x float>, - <16 x float>) nounwind readonly alwaysinline { - binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1) - ret <16 x float> %call +define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %0, <16 x float> %1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -301,30 +402,34 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { ret i32 %ret } -declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly -declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly +declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { - binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1) - ret <16 x i32> %m + %ret = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %0, <16 x i32> %1, + <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %ret } define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { - binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1) - ret <16 x i32> %m + %ret = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %0, <16 x i32> %1, + <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly -declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly +declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { - binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1) - ret <16 x i32> %m + %ret = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %0, <16 x i32> %1, + <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %ret } define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { - binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1) - ret <16 x i32> %m + %ret = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %0, <16 x i32> %1, + <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %ret } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -342,17 +447,47 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli ret double %ret } -declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone -declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone +declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>, + <8 x double>, i8, i32) +declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>, + <8 x double>, i8, i32) define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline { - binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1) - ret <16 x double> %ret + %a_0 = shufflevector <16 x double> %0, <16 x double> undef, + <8 x i32> + %a_1 = shufflevector <16 x double> %1, <16 x double> undef, + <8 x i32> + %res_a = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a_0, <8 x double> %a_1, + <8 x double> zeroinitializer, i8 -1, i32 4) + %b_0 = shufflevector <16 x double> %0, <16 x double> undef, + <8 x i32> + %b_1 = shufflevector <16 x double> %1, <16 x double> undef, + <8 x i32> + %res_b = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %b_0, <8 x double> %b_1, + <8 x double> zeroinitializer, i8 -1, i32 4) + %res = shufflevector <8 x double> %res_a, <8 x double> %res_b, + <16 x i32> + ret <16 x double> %res } define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline { - binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1) - ret <16 x double> %ret + %a_0 = shufflevector <16 x double> %0, <16 x double> undef, + <8 x i32> + %a_1 = shufflevector <16 x double> %1, <16 x double> undef, + <8 x i32> + %res_a = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a_0, <8 x double> %a_1, + <8 x double> zeroinitializer, i8 -1, i32 4) + %b_0 = shufflevector <16 x double> %0, <16 x double> undef, + <8 x i32> + %b_1 = shufflevector <16 x double> %1, <16 x double> undef, + <8 x i32> + %res_b = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %b_0, <8 x double> %b_1, + <8 x double> zeroinitializer, i8 -1, i32 4) + %res = shufflevector <8 x double> %res_a, <8 x double> %res_b, + <16 x i32> + ret <16 x double> %res } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -376,24 +511,11 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline { ret float %half_scale } -declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone +declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { - ; float is = __rsqrt_v(v); - unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v) - ; return 0.5 * is * (3. - (v * is) * is); - %v_is = fmul <16 x float> %v, %is - %v_is_is = fmul <16 x float> %v_is, %is - %three_sub = fsub <16 x float> , %v_is_is - %is_mul = fmul <16 x float> %is, %three_sub - %half_scale = fmul <16 x float> , %is_mul - ret <16 x float> %half_scale + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8) + ret <16 x float> %res } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -416,21 +538,11 @@ define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { ret float %iv_mul } -declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone +declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline { - ; float iv = __rcp_v(v); - ; return iv * (2. - v * iv); - - unary8to16(call, float, @llvm.x86.avx.rcp.ps.256, %0) - ; do one N-R iteration - %v_iv = fmul <16 x float> %0, %call - %two_minus = fsub <16 x float> , %v_iv - %iv_mul = fmul <16 x float> %call, %two_minus - ret <16 x float> %iv_mul + %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %0, <16 x float> undef, i16 -1, i32 8) + ret <16 x float> %res } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -443,11 +555,11 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline { ret float %ret } -declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone +declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline { - unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0) - ret <16 x float> %call + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %0, <16 x float> zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -460,11 +572,19 @@ define double @__sqrt_uniform_double(double) nounwind alwaysinline { ret double %ret } -declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone +declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline { - unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0) - ret <16 x double> %ret + %v0 = shufflevector <16 x double> %0, <16 x double> undef, + <8 x i32> + %v1 = shufflevector <16 x double> %0, <16 x double> undef, + <8 x i32> + %r0 = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %v0, <8 x double> zeroinitializer, i8 -1, i32 4) + %r1 = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %v1, <8 x double> zeroinitializer, i8 -1, i32 4) + %res = shufflevector <8 x double> %r0, <8 x double> %r1, + <16 x i32> + ret <16 x double> %res } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; bit ops @@ -691,29 +811,125 @@ define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline { masked_load(i8, 1) masked_load(i16, 2) -masked_load(i32, 4) -masked_load(i64, 8) -masked_load_float_double() +declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16) +define <16 x i32> @__masked_load_i32(i8 * %ptr, <16 x i1> %mask) nounwind alwaysinline { + %mask_i16 = bitcast <16 x i1> %mask to i16 + %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask_i16) + ret <16 x i32> %res +} -gen_masked_store(i8) +declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8) +define <16 x i64> @__masked_load_i64(i8 * %ptr, <16 x i1> %mask) nounwind alwaysinline { + %mask_i16 = bitcast <16 x i1> %mask to i16 + %mask_lo_i8 = trunc i16 %mask_i16 to i8 + %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef, + <8 x i32> + %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8 + + %ptr_d = bitcast i8* %ptr to <16 x i64>* + %ptr_hi = getelementptr PTR_OP_ARGS(`<16 x i64>') %ptr_d, i32 0, i32 8 + %ptr_hi_i8 = bitcast i64* %ptr_hi to i8* + + %r0 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask_lo_i8) + %r1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr_hi_i8, <8 x i64> zeroinitializer, i8 %mask_hi_i8) + + %res = shufflevector <8 x i64> %r0, <8 x i64> %r1, + <16 x i32> + ret <16 x i64> %res +} + + +declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16) +define <16 x float> @__masked_load_float(i8 * %ptr, <16 x i1> %mask) readonly alwaysinline { + %mask_i16 = bitcast <16 x i1> %mask to i16 + %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask_i16) + ret <16 x float> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8) +define <16 x double> @__masked_load_double(i8 * %ptr, <16 x i1> %mask) readonly alwaysinline { + %mask_i16 = bitcast <16 x i1> %mask to i16 + %mask_lo_i8 = trunc i16 %mask_i16 to i8 + %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef, + <8 x i32> + %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8 + + %ptr_d = bitcast i8* %ptr to <16 x double>* + %ptr_hi = getelementptr PTR_OP_ARGS(`<16 x double>') %ptr_d, i32 0, i32 8 + %ptr_hi_i8 = bitcast double* %ptr_hi to i8* + + %r0 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask_lo_i8) + %r1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr_hi_i8, <8 x double> zeroinitializer, i8 %mask_hi_i8) + + %res = shufflevector <8 x double> %r0, <8 x double> %r1, + <16 x i32> + ret <16 x double> %res +} + + +gen_masked_store(i8) ; llvm.x86.sse2.storeu.dq gen_masked_store(i16) -gen_masked_store(i32) -gen_masked_store(i64) -define void @__masked_store_float( * nocapture, , - ) nounwind alwaysinline { - %ptr = bitcast * %0 to * - %val = bitcast %1 to - call void @__masked_store_i32( * %ptr, %val, %2) +declare void @llvm.x86.avx512.mask.storeu.d.512(i8*, <16 x i32>, i16) +define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32> %v, <16 x i1> %mask) nounwind alwaysinline { + %mask_i16 = bitcast <16 x i1> %mask to i16 + %ptr_i8 = bitcast <16 x i32>* %0 to i8* + call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr_i8, <16 x i32> %v, i16 %mask_i16) ret void } -define void @__masked_store_double( * nocapture, , - ) nounwind alwaysinline { - %ptr = bitcast * %0 to * - %val = bitcast %1 to - call void @__masked_store_i64( * %ptr, %val, %2) +declare void @llvm.x86.avx512.mask.storeu.q.512(i8*, <8 x i64>, i8) +define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64> %v, <16 x i1> %mask) nounwind alwaysinline { + %mask_i16 = bitcast <16 x i1> %mask to i16 + %mask_lo_i8 = trunc i16 %mask_i16 to i8 + %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef, + <8 x i32> + %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8 + + %ptr_i8 = bitcast <16 x i64>* %0 to i8* + %ptr_lo = getelementptr PTR_OP_ARGS(`<16 x i64>') %0, i32 0, i32 8 + %ptr_lo_i8 = bitcast i64* %ptr_lo to i8* + + %v_lo = shufflevector <16 x i64> %v, <16 x i64> undef, + <8 x i32> + %v_hi = shufflevector <16 x i64> %v, <16 x i64> undef, + <8 x i32> + + call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr_i8, <8 x i64> %v_lo, i8 %mask_lo_i8) + call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr_lo_i8, <8 x i64> %v_hi, i8 %mask_hi_i8) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 ) +define void @__masked_store_float(<16 x float>* nocapture, <16 x float> %v, <16 x i1> %mask) nounwind alwaysinline { + %mask_i16 = bitcast <16 x i1> %mask to i16 + %ptr_i8 = bitcast <16 x float>* %0 to i8* + call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr_i8, <16 x float> %v, i16 %mask_i16) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8) +define void @__masked_store_double(<16 x double>* nocapture, <16 x double> %v, <16 x i1> %mask) nounwind alwaysinline { + %mask_i16 = bitcast <16 x i1> %mask to i16 + %mask_lo_i8 = trunc i16 %mask_i16 to i8 + %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef, + <8 x i32> + %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8 + + %ptr_i8 = bitcast <16 x double>* %0 to i8* + %ptr_lo = getelementptr PTR_OP_ARGS(`<16 x double>') %0, i32 0, i32 8 + %ptr_lo_i8 = bitcast double* %ptr_lo to i8* + + %v_lo = shufflevector <16 x double> %v, <16 x double> undef, + <8 x i32> + %v_hi = shufflevector <16 x double> %v, <16 x double> undef, + <8 x i32> + + call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr_i8, <8 x double> %v_lo, i8 %mask_lo_i8) + call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr_lo_i8, <8 x double> %v_hi, i8 %mask_hi_i8) ret void } @@ -735,33 +951,25 @@ define void @__masked_store_blend_i16(* nocapture, , define void @__masked_store_blend_i32(* nocapture, , ) nounwind alwaysinline { - %v = load PTR_OP_ARGS(` ') %0 - %v1 = select %2, %1, %v - store %v1, * %0 + call void @__masked_store_i32(<16 x i32>* %0, <16 x i32> %1, <16 x i1> %2) ret void } define void @__masked_store_blend_float(* nocapture, , ) nounwind alwaysinline { - %v = load PTR_OP_ARGS(` ') %0 - %v1 = select %2, %1, %v - store %v1, * %0 + call void @__masked_store_float(<16 x float>* %0, <16 x float> %1, <16 x i1> %2) ret void } define void @__masked_store_blend_i64(* nocapture, , ) nounwind alwaysinline { - %v = load PTR_OP_ARGS(` ') %0 - %v1 = select %2, %1, %v - store %v1, * %0 + call void @__masked_store_i64(<16 x i64>* %0, <16 x i64> %1, <16 x i1> %2) ret void } define void @__masked_store_blend_double(* nocapture, , ) nounwind alwaysinline { - %v = load PTR_OP_ARGS(` ') %0 - %v1 = select %2, %1, %v - store %v1, * %0 + call void @__masked_store_double(<16 x double>* %0, <16 x double> %1, <16 x i1> %2) ret void } diff --git a/fail_db.txt b/fail_db.txt index 5c8c3f1e..dedb6681 100644 --- a/fail_db.txt +++ b/fail_db.txt @@ -166,85 +166,26 @@ ./tests/ptr-19.ispc runfail x86-64 generic-16 Linux LLVM 3.7 clang++3.4 -O0 * ./tests/ptr-22.ispc runfail x86-64 generic-16 Linux LLVM 3.7 clang++3.4 -O0 * ./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.7 clang++3.4 -O0 * -./tests/operators2.ispc runfail x86-64 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/acos.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 * -./tests/asin.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 * -./tests/operators2.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 * -./tests/packed-store-1.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 * -./tests/packed-store2-1.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 * -./tests/short-circuit-14.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 * -./tests/short-circuit-15.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 * -./tests/acos.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 * -./tests/asin.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 * -./tests/short-circuit-14.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 * -./tests/short-circuit-15.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 * -./tests/short-circuit-5.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 * -./tests/short-circuit-6.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 * -./tests/short-circuit-7.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 * -./tests/short-circuit-8.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 * -./tests/short-circuit-9.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 * -.\tests\acos.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 * -.\tests\asin.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 * -.\tests\memcpy-varying.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 * -.\tests\operators2.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 * -.\tests\packed-store-1.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 * -.\tests\packed-store2-1.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 * -.\tests\short-circuit-14.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 * -.\tests\short-circuit-15.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 * -.\tests\acos.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 * -.\tests\asin.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 * -.\tests\short-circuit-14.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 * -.\tests\short-circuit-15.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 * -.\tests\short-circuit-5.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 * -.\tests\short-circuit-6.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 * -.\tests\short-circuit-7.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 * -.\tests\short-circuit-8.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 * -.\tests\short-circuit-9.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 * -./tests/operators2.ispc runfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/rand-distrib.ispc runfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/shift-1.ispc runfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/foreach-active-5.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/idiv.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/int64-max-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/int64-max.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/int64-min-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/int64-min.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/pmuls_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/pmuls_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/pmulus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/pmulus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/rand-distrib-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/reduce-max-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/reduce-max-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/reduce-min-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/reduce-min-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/rotate.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/shuffle2-5.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/uint64-max-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/uint64-max.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/uint64-min-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/uint64-min.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/idiv.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/int64-max-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/int64-max.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/int64-min-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/int64-min.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/paddus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/paddus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/pmuls_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/pmuls_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/pmulus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/pmulus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/psubus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/psubus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/reduce-max-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/reduce-max-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/reduce-min-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/reduce-min-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/uint64-max-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/uint64-max.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/uint64-min-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/uint64-min.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 * -./tests/rand-distrib.ispc runfail x86-64 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/shift-1.ispc runfail x86-64 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * -./tests/shuffle2-5.ispc runfail x86-64 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 * +./tests/foreach-active-5.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 * +./tests/idiv.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 * +./tests/pmuls_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 * +./tests/pmuls_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 * +./tests/pmulus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 * +./tests/pmulus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 * +./tests/reduce-max-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 * +./tests/reduce-max-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 * +./tests/reduce-min-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 * +./tests/reduce-min-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 * +./tests/idiv.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 * +./tests/paddus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 * +./tests/paddus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 * +./tests/pmuls_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 * +./tests/pmuls_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 * +./tests/pmulus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 * +./tests/pmulus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 * +./tests/psubus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 * +./tests/psubus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 * +./tests/reduce-max-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 * +./tests/reduce-max-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 * +./tests/reduce-min-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 * +./tests/reduce-min-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *