Merge pull request #1066 from ncos/native-knl

Native knl
Dmitry Babokin
2015-07-16 18:06:32 +03:00
2 changed files with 335 additions and 186 deletions


@@ -54,7 +54,6 @@ aossoa()
 ;; half conversion routines
 declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
-; 0 is round nearest even
 declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
 define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
@@ -204,42 +203,146 @@ define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats
-declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
+declare <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
+declare <16 x float> @llvm.floor.v16f32(<16 x float> %p)
+declare <16 x float> @llvm.ceil.v16f32(<16 x float> %p)
 define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline {
-  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
-  round8to16(%0, 8)
+  %res = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %0)
+  ret <16 x float> %res
 }
 define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline {
-  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
-  round8to16(%0, 9)
+  %res = call <16 x float> @llvm.floor.v16f32(<16 x float> %0)
+  ret <16 x float> %res
 }
 define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline {
-  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
-  round8to16(%0, 10)
+  %res = call <16 x float> @llvm.ceil.v16f32(<16 x float> %0)
+  ret <16 x float> %res
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding doubles
-declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+declare <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)
+declare <8 x double> @llvm.floor.v8f64(<8 x double> %p)
+declare <8 x double> @llvm.ceil.v8f64(<8 x double> %p)
 define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
-  round4to16double(%0, 8)
+  %v0 = shufflevector <16 x double> %0, <16 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1 = shufflevector <16 x double> %0, <16 x double> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %r0 = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %v0)
+  %r1 = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %v1)
+  %res = shufflevector <8 x double> %r0, <8 x double> %r1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                                                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x double> %res
 }
 define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
-  round4to16double(%0, 9)
+  %v0 = shufflevector <16 x double> %0, <16 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1 = shufflevector <16 x double> %0, <16 x double> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %r0 = call <8 x double> @llvm.floor.v8f64(<8 x double> %v0)
+  %r1 = call <8 x double> @llvm.floor.v8f64(<8 x double> %v1)
+  %res = shufflevector <8 x double> %r0, <8 x double> %r1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                                                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x double> %res
 }
 define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
-  round4to16double(%0, 10)
+  %v0 = shufflevector <16 x double> %0, <16 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1 = shufflevector <16 x double> %0, <16 x double> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %r0 = call <8 x double> @llvm.ceil.v8f64(<8 x double> %v0)
+  %r1 = call <8 x double> @llvm.ceil.v8f64(<8 x double> %v1)
+  %res = shufflevector <8 x double> %r0, <8 x double> %r1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                                                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x double> %res
 }
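For reference, not part of the diff: the generic llvm.floor/llvm.ceil/llvm.nearbyint intrinsics used above are expected to lower to the AVX-512 round-scale instructions, whose immediate keeps the same low four bits as the old roundps encoding (e.g. 9 = round toward negative infinity, suppress precision exceptions). A minimal C sketch of the same operation, assuming an AVX-512F toolchain; the function name is made up for illustration:

    #include <immintrin.h>

    /* 16-wide floor; immediate 9 = round down (0b01) | no precision exception (0b1000). */
    static inline __m512 floor16f(__m512 v) {
        return _mm512_roundscale_ps(v, 9);
    }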
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; min/max
-int64minmax()
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int64/uint64 min/max
+define i64 @__max_uniform_int64(i64, i64) nounwind readonly alwaysinline {
+  %c = icmp sgt i64 %0, %1
+  %r = select i1 %c, i64 %0, i64 %1
+  ret i64 %r
+}
+define i64 @__max_uniform_uint64(i64, i64) nounwind readonly alwaysinline {
+  %c = icmp ugt i64 %0, %1
+  %r = select i1 %c, i64 %0, i64 %1
+  ret i64 %r
+}
+define i64 @__min_uniform_int64(i64, i64) nounwind readonly alwaysinline {
+  %c = icmp slt i64 %0, %1
+  %r = select i1 %c, i64 %0, i64 %1
+  ret i64 %r
+}
+define i64 @__min_uniform_uint64(i64, i64) nounwind readonly alwaysinline {
+  %c = icmp ult i64 %0, %1
+  %r = select i1 %c, i64 %0, i64 %1
+  ret i64 %r
+}
+declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+define <16 x i64> @__max_varying_int64(<16 x i64>, <16 x i64>) nounwind readonly alwaysinline {
+  %v0_lo = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v0_hi = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v1_lo = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1_hi = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %r0 = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %v0_lo, <8 x i64> %v1_lo, <8 x i64> zeroinitializer, i8 -1)
+  %r1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %v0_hi, <8 x i64> %v1_hi, <8 x i64> zeroinitializer, i8 -1)
+  %res = shufflevector <8 x i64> %r0, <8 x i64> %r1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                                                 i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i64> %res
+}
+define <16 x i64> @__max_varying_uint64(<16 x i64>, <16 x i64>) nounwind readonly alwaysinline {
+  %v0_lo = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v0_hi = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v1_lo = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1_hi = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %r0 = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %v0_lo, <8 x i64> %v1_lo, <8 x i64> zeroinitializer, i8 -1)
+  %r1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %v0_hi, <8 x i64> %v1_hi, <8 x i64> zeroinitializer, i8 -1)
+  %res = shufflevector <8 x i64> %r0, <8 x i64> %r1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                                                 i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i64> %res
+}
+define <16 x i64> @__min_varying_int64(<16 x i64>, <16 x i64>) nounwind readonly alwaysinline {
+  %v0_lo = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v0_hi = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v1_lo = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1_hi = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %r0 = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %v0_lo, <8 x i64> %v1_lo, <8 x i64> zeroinitializer, i8 -1)
+  %r1 = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %v0_hi, <8 x i64> %v1_hi, <8 x i64> zeroinitializer, i8 -1)
+  %res = shufflevector <8 x i64> %r0, <8 x i64> %r1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                                                 i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i64> %res
+}
+define <16 x i64> @__min_varying_uint64(<16 x i64>, <16 x i64>) nounwind readonly alwaysinline {
+  %v0_lo = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v0_hi = shufflevector <16 x i64> %0, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %v1_lo = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1_hi = shufflevector <16 x i64> %1, <16 x i64> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %r0 = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %v0_lo, <8 x i64> %v1_lo, <8 x i64> zeroinitializer, i8 -1)
+  %r1 = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %v0_hi, <8 x i64> %v1_hi, <8 x i64> zeroinitializer, i8 -1)
+  %res = shufflevector <8 x i64> %r0, <8 x i64> %r1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                                                 i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i64> %res
+}
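For reference, not part of the diff: the varying 64-bit min/max functions above follow the pattern used throughout this target of splitting a 16-wide value into two native 512-bit halves, applying the AVX-512F instruction to each half, and reassembling the result. A rough C sketch of the same idea (the i64x16 type and function name are invented for illustration):

    #include <immintrin.h>

    typedef struct { __m512i lo, hi; } i64x16;   /* two <8 x i64> halves */

    /* Lane-wise signed 64-bit max over a 16-wide value (vpmaxsq on each half). */
    static inline i64x16 max_i64x16(i64x16 a, i64x16 b) {
        i64x16 r;
        r.lo = _mm512_max_epi64(a.lo, b.lo);
        r.hi = _mm512_max_epi64(a.hi, b.hi);
        return r;
    }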
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
@@ -256,19 +359,17 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
 ret float %ret
 }
-declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
-declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
-define <16 x float> @__max_varying_float(<16 x float>,
-                                         <16 x float>) nounwind readonly alwaysinline {
-  binary8to16(call, float, @llvm.x86.avx.max.ps.256, %0, %1)
-  ret <16 x float> %call
+declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline {
+  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
+  ret <16 x float> %res
 }
-define <16 x float> @__min_varying_float(<16 x float>,
-                                         <16 x float>) nounwind readonly alwaysinline {
-  binary8to16(call, float, @llvm.x86.avx.min.ps.256, %0, %1)
-  ret <16 x float> %call
+define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline {
+  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %0, <16 x float> %1, <16 x float> zeroinitializer, i16 -1, i32 4)
+  ret <16 x float> %res
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
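A note on the call arguments above, for readers new to these intrinsics: the <16 x float> zeroinitializer is the (unused) pass-through operand, i16 -1 is an all-ones lane mask, and i32 4 is _MM_FROUND_CUR_DIRECTION, i.e. use the current rounding mode. In C terms the unmasked forms are simply (sketch, not from the patch):

    #include <immintrin.h>

    static inline __m512 max16f(__m512 a, __m512 b) { return _mm512_max_ps(a, b); }  /* vmaxps */
    static inline __m512 min16f(__m512 a, __m512 b) { return _mm512_min_ps(a, b); }  /* vminps */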
@@ -301,30 +402,34 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
 ret i32 %ret
 }
-declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readonly
-declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary8to16(m, i32, @llvm.x86.avx2.pmins.d, %0, %1)
-  ret <16 x i32> %m
+  %ret = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %0, <16 x i32> %1,
+                                                           <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %ret
 }
 define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary8to16(m, i32, @llvm.x86.avx2.pmaxs.d, %0, %1)
-  ret <16 x i32> %m
+  %ret = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %0, <16 x i32> %1,
+                                                           <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readonly
-declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readonly
+declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary8to16(m, i32, @llvm.x86.avx2.pminu.d, %0, %1)
-  ret <16 x i32> %m
+  %ret = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %0, <16 x i32> %1,
+                                                           <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %ret
 }
 define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
-  binary8to16(m, i32, @llvm.x86.avx2.pmaxu.d, %0, %1)
-  ret <16 x i32> %m
+  %ret = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %0, <16 x i32> %1,
+                                                           <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -342,17 +447,47 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli
 ret double %ret
 }
-declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
-declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>,
+                                                      <8 x double>, i8, i32)
+declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>,
+                                                      <8 x double>, i8, i32)
 define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
-  binary4to16(ret, double, @llvm.x86.avx.min.pd.256, %0, %1)
-  ret <16 x double> %ret
+  %a_0 = shufflevector <16 x double> %0, <16 x double> undef,
+                       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %a_1 = shufflevector <16 x double> %1, <16 x double> undef,
+                       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %res_a = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a_0, <8 x double> %a_1,
+                                                              <8 x double> zeroinitializer, i8 -1, i32 4)
+  %b_0 = shufflevector <16 x double> %0, <16 x double> undef,
+                       <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %b_1 = shufflevector <16 x double> %1, <16 x double> undef,
+                       <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %res_b = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %b_0, <8 x double> %b_1,
+                                                              <8 x double> zeroinitializer, i8 -1, i32 4)
+  %res = shufflevector <8 x double> %res_a, <8 x double> %res_b,
+                       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x double> %res
 }
 define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone alwaysinline {
-  binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
-  ret <16 x double> %ret
+  %a_0 = shufflevector <16 x double> %0, <16 x double> undef,
+                       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %a_1 = shufflevector <16 x double> %1, <16 x double> undef,
+                       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %res_a = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a_0, <8 x double> %a_1,
+                                                              <8 x double> zeroinitializer, i8 -1, i32 4)
+  %b_0 = shufflevector <16 x double> %0, <16 x double> undef,
+                       <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %b_1 = shufflevector <16 x double> %1, <16 x double> undef,
+                       <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %res_b = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %b_0, <8 x double> %b_1,
+                                                              <8 x double> zeroinitializer, i8 -1, i32 4)
+  %res = shufflevector <8 x double> %res_a, <8 x double> %res_b,
+                       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x double> %res
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -376,24 +511,11 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
 ret float %half_scale
 }
-declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
 define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
-  ; float is = __rsqrt_v(v);
-  unary8to16(is, float, @llvm.x86.avx.rsqrt.ps.256, %v)
-  ; return 0.5 * is * (3. - (v * is) * is);
-  %v_is = fmul <16 x float> %v, %is
-  %v_is_is = fmul <16 x float> %v_is, %is
-  %three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
-                                  float 3., float 3., float 3., float 3.,
-                                  float 3., float 3., float 3., float 3.,
-                                  float 3., float 3., float 3., float 3.>, %v_is_is
-  %is_mul = fmul <16 x float> %is, %three_sub
-  %half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
-                                   float 0.5, float 0.5, float 0.5, float 0.5,
-                                   float 0.5, float 0.5, float 0.5, float 0.5,
-                                   float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
-  ret <16 x float> %half_scale
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8)
+  ret <16 x float> %res
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -416,21 +538,11 @@ define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
 ret float %iv_mul
 }
-declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
 define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
-  ; float iv = __rcp_v(v);
-  ; return iv * (2. - v * iv);
-  unary8to16(call, float, @llvm.x86.avx.rcp.ps.256, %0)
-  ; do one N-R iteration
-  %v_iv = fmul <16 x float> %0, %call
-  %two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
-                                  float 2., float 2., float 2., float 2.,
-                                  float 2., float 2., float 2., float 2.,
-                                  float 2., float 2., float 2., float 2.>, %v_iv
-  %iv_mul = fmul <16 x float> %call, %two_minus
-  ret <16 x float> %iv_mul
+  %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %0, <16 x float> undef, i16 -1, i32 8)
+  ret <16 x float> %res
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
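Background, not part of the diff: rsqrt28/rcp28 come from AVX-512ER, which Knights Landing provides. They are accurate to roughly 2^-28, so the Newton-Raphson refinement step that the old 8-wide AVX path performed after vrsqrtps/vrcpps is simply dropped; the trailing i32 8 argument is _MM_FROUND_NO_EXC. A C sketch under the assumption of an AVX-512ER target:

    #include <immintrin.h>

    /* Requires AVX-512ER (KNL); no refinement iteration is applied afterwards. */
    static inline __m512 rsqrt16f(__m512 v) { return _mm512_rsqrt28_ps(v); }  /* vrsqrt28ps */
    static inline __m512 rcp16f(__m512 v)   { return _mm512_rcp28_ps(v); }    /* vrcp28ps */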
@@ -443,11 +555,11 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
 ret float %ret
 }
-declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
 define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline {
-  unary8to16(call, float, @llvm.x86.avx.sqrt.ps.256, %0)
-  ret <16 x float> %call
+  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %0, <16 x float> zeroinitializer, i16 -1, i32 4)
+  ret <16 x float> %res
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -460,11 +572,19 @@ define double @__sqrt_uniform_double(double) nounwind alwaysinline {
 ret double %ret
 }
-declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
 define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind alwaysinline {
-  unary4to16(ret, double, @llvm.x86.avx.sqrt.pd.256, %0)
-  ret <16 x double> %ret
+  %v0 = shufflevector <16 x double> %0, <16 x double> undef,
+                      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1 = shufflevector <16 x double> %0, <16 x double> undef,
+                      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %r0 = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %v0, <8 x double> zeroinitializer, i8 -1, i32 4)
+  %r1 = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %v1, <8 x double> zeroinitializer, i8 -1, i32 4)
+  %res = shufflevector <8 x double> %r0, <8 x double> %r1,
+                       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x double> %res
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; bit ops
@@ -691,29 +811,125 @@ define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone alwaysinline {
 masked_load(i8, 1)
 masked_load(i16, 2)
-masked_load(i32, 4)
-masked_load(i64, 8)
-masked_load_float_double()
+declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16)
+define <16 x i32> @__masked_load_i32(i8 * %ptr, <16 x i1> %mask) nounwind alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask_i16)
+  ret <16 x i32> %res
+}
+declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8)
+define <16 x i64> @__masked_load_i64(i8 * %ptr, <16 x i1> %mask) nounwind alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %mask_lo_i8 = trunc i16 %mask_i16 to i8
+  %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef,
+                           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8
+  %ptr_d = bitcast i8* %ptr to <16 x i64>*
+  %ptr_hi = getelementptr PTR_OP_ARGS(`<16 x i64>') %ptr_d, i32 0, i32 8
+  %ptr_hi_i8 = bitcast i64* %ptr_hi to i8*
+  %r0 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask_lo_i8)
+  %r1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr_hi_i8, <8 x i64> zeroinitializer, i8 %mask_hi_i8)
+  %res = shufflevector <8 x i64> %r0, <8 x i64> %r1,
+                       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i64> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
+define <16 x float> @__masked_load_float(i8 * %ptr, <16 x i1> %mask) readonly alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask_i16)
+  ret <16 x float> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)
+define <16 x double> @__masked_load_double(i8 * %ptr, <16 x i1> %mask) readonly alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %mask_lo_i8 = trunc i16 %mask_i16 to i8
+  %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef,
+                           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8
+  %ptr_d = bitcast i8* %ptr to <16 x double>*
+  %ptr_hi = getelementptr PTR_OP_ARGS(`<16 x double>') %ptr_d, i32 0, i32 8
+  %ptr_hi_i8 = bitcast double* %ptr_hi to i8*
+  %r0 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask_lo_i8)
+  %r1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr_hi_i8, <8 x double> zeroinitializer, i8 %mask_hi_i8)
+  %res = shufflevector <8 x double> %r0, <8 x double> %r1,
+                       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x double> %res
+}
-gen_masked_store(i8)
+gen_masked_store(i8) ; llvm.x86.sse2.storeu.dq
 gen_masked_store(i16)
-gen_masked_store(i32)
-gen_masked_store(i64)
-define void @__masked_store_float(<WIDTH x float> * nocapture, <WIDTH x float>,
-                                  <WIDTH x MASK>) nounwind alwaysinline {
-  %ptr = bitcast <WIDTH x float> * %0 to <WIDTH x i32> *
-  %val = bitcast <WIDTH x float> %1 to <WIDTH x i32>
-  call void @__masked_store_i32(<WIDTH x i32> * %ptr, <WIDTH x i32> %val, <WIDTH x MASK> %2)
+declare void @llvm.x86.avx512.mask.storeu.d.512(i8*, <16 x i32>, i16)
+define void @__masked_store_i32(<16 x i32>* nocapture, <16 x i32> %v, <16 x i1> %mask) nounwind alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %ptr_i8 = bitcast <16 x i32>* %0 to i8*
+  call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr_i8, <16 x i32> %v, i16 %mask_i16)
 ret void
 }
-define void @__masked_store_double(<WIDTH x double> * nocapture, <WIDTH x double>,
-                                   <WIDTH x MASK>) nounwind alwaysinline {
-  %ptr = bitcast <WIDTH x double> * %0 to <WIDTH x i64> *
-  %val = bitcast <WIDTH x double> %1 to <WIDTH x i64>
-  call void @__masked_store_i64(<WIDTH x i64> * %ptr, <WIDTH x i64> %val, <WIDTH x MASK> %2)
+declare void @llvm.x86.avx512.mask.storeu.q.512(i8*, <8 x i64>, i8)
+define void @__masked_store_i64(<16 x i64>* nocapture, <16 x i64> %v, <16 x i1> %mask) nounwind alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %mask_lo_i8 = trunc i16 %mask_i16 to i8
+  %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef,
+                           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8
+  %ptr_i8 = bitcast <16 x i64>* %0 to i8*
+  %ptr_lo = getelementptr PTR_OP_ARGS(`<16 x i64>') %0, i32 0, i32 8
+  %ptr_lo_i8 = bitcast i64* %ptr_lo to i8*
+  %v_lo = shufflevector <16 x i64> %v, <16 x i64> undef,
+                        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v_hi = shufflevector <16 x i64> %v, <16 x i64> undef,
+                        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr_i8, <8 x i64> %v_lo, i8 %mask_lo_i8)
+  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr_lo_i8, <8 x i64> %v_hi, i8 %mask_hi_i8)
+  ret void
+}
+declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16)
+define void @__masked_store_float(<16 x float>* nocapture, <16 x float> %v, <16 x i1> %mask) nounwind alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %ptr_i8 = bitcast <16 x float>* %0 to i8*
+  call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr_i8, <16 x float> %v, i16 %mask_i16)
+  ret void
+}
+declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
+define void @__masked_store_double(<16 x double>* nocapture, <16 x double> %v, <16 x i1> %mask) nounwind alwaysinline {
+  %mask_i16 = bitcast <16 x i1> %mask to i16
+  %mask_lo_i8 = trunc i16 %mask_i16 to i8
+  %mask_hi = shufflevector <16 x i1> %mask, <16 x i1> undef,
+                           <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %mask_hi_i8 = bitcast <8 x i1> %mask_hi to i8
+  %ptr_i8 = bitcast <16 x double>* %0 to i8*
+  %ptr_lo = getelementptr PTR_OP_ARGS(`<16 x double>') %0, i32 0, i32 8
+  %ptr_lo_i8 = bitcast double* %ptr_lo to i8*
+  %v_lo = shufflevector <16 x double> %v, <16 x double> undef,
+                        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v_hi = shufflevector <16 x double> %v, <16 x double> undef,
+                        <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr_i8, <8 x double> %v_lo, i8 %mask_lo_i8)
+  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr_lo_i8, <8 x double> %v_hi, i8 %mask_hi_i8)
 ret void
 }
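For reference, not part of the diff: the masked load/store builtins above map onto the AVX-512 mask registers, where a 16-bit mask selects the active lanes; masked-off lanes take the pass-through value on a load and are left untouched in memory on a store. The i64 and double variants split the 16-wide value into two 8-wide halves exactly as shown. A small C sketch of the 32-bit case (helper names are invented):

    #include <immintrin.h>

    /* Masked 32-bit load: inactive lanes come from the zero pass-through operand. */
    static inline __m512i masked_load_i32(const void *p, __mmask16 m) {
        return _mm512_mask_loadu_epi32(_mm512_setzero_si512(), m, p);
    }

    /* Masked 32-bit store: inactive lanes of memory are not written. */
    static inline void masked_store_i32(void *p, __m512i v, __mmask16 m) {
        _mm512_mask_storeu_epi32(p, m, v);
    }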
@@ -735,33 +951,25 @@ define void @__masked_store_blend_i16(<WIDTH x i16>* nocapture, <WIDTH x i16>,
 define void @__masked_store_blend_i32(<WIDTH x i32>* nocapture, <WIDTH x i32>,
                                       <WIDTH x i1>) nounwind alwaysinline {
-  %v = load PTR_OP_ARGS(`<WIDTH x i32> ') %0
-  %v1 = select <WIDTH x i1> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
-  store <WIDTH x i32> %v1, <WIDTH x i32> * %0
+  call void @__masked_store_i32(<16 x i32>* %0, <16 x i32> %1, <16 x i1> %2)
 ret void
 }
 define void @__masked_store_blend_float(<WIDTH x float>* nocapture, <WIDTH x float>,
                                         <WIDTH x i1>) nounwind alwaysinline {
-  %v = load PTR_OP_ARGS(`<WIDTH x float> ') %0
-  %v1 = select <WIDTH x i1> %2, <WIDTH x float> %1, <WIDTH x float> %v
-  store <WIDTH x float> %v1, <WIDTH x float> * %0
+  call void @__masked_store_float(<16 x float>* %0, <16 x float> %1, <16 x i1> %2)
 ret void
 }
 define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture,
                                       <WIDTH x i64>, <WIDTH x i1>) nounwind alwaysinline {
-  %v = load PTR_OP_ARGS(`<WIDTH x i64> ') %0
-  %v1 = select <WIDTH x i1> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
-  store <WIDTH x i64> %v1, <WIDTH x i64> * %0
+  call void @__masked_store_i64(<16 x i64>* %0, <16 x i64> %1, <16 x i1> %2)
 ret void
 }
 define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
                                          <WIDTH x double>, <WIDTH x i1>) nounwind alwaysinline {
-  %v = load PTR_OP_ARGS(`<WIDTH x double> ') %0
-  %v1 = select <WIDTH x i1> %2, <WIDTH x double> %1, <WIDTH x double> %v
-  store <WIDTH x double> %v1, <WIDTH x double> * %0
+  call void @__masked_store_double(<16 x double>* %0, <16 x double> %1, <16 x i1> %2)
 ret void
 }
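The blend variants above now just forward to the masked stores, because a hardware-masked store already leaves inactive lanes of memory unchanged; the old load/select/store sequence is redundant on this target. A C sketch of the two formulations (illustrative only, not from the patch); note that the masked store also avoids reading or rewriting the masked-off lanes at all:

    #include <immintrin.h>

    /* Old formulation: explicit read-modify-write blend. */
    static inline void blend_store_i32(void *p, __m512i v, __mmask16 m) {
        __m512i old     = _mm512_loadu_si512(p);
        __m512i blended = _mm512_mask_blend_epi32(m, old, v);   /* take v where m is set */
        _mm512_storeu_si512(p, blended);
    }

    /* New formulation: let the masked store do the blending. */
    static inline void masked_store_blend_i32(void *p, __m512i v, __mmask16 m) {
        _mm512_mask_storeu_epi32(p, m, v);
    }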


@@ -166,85 +166,26 @@
 ./tests/ptr-19.ispc runfail x86-64 generic-16 Linux LLVM 3.7 clang++3.4 -O0 *
 ./tests/ptr-22.ispc runfail x86-64 generic-16 Linux LLVM 3.7 clang++3.4 -O0 *
 ./tests/test-143.ispc runfail x86-64 generic-16 Linux LLVM 3.7 clang++3.4 -O0 *
-./tests/operators2.ispc runfail x86-64 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/acos.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 *
-./tests/asin.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 *
-./tests/operators2.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 *
-./tests/packed-store-1.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 *
-./tests/packed-store2-1.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 *
-./tests/short-circuit-14.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 *
-./tests/short-circuit-15.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O2 *
-./tests/acos.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 *
-./tests/asin.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 *
-./tests/short-circuit-14.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 *
-./tests/short-circuit-15.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 *
-./tests/short-circuit-5.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 *
-./tests/short-circuit-6.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 *
-./tests/short-circuit-7.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 *
-./tests/short-circuit-8.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 *
-./tests/short-circuit-9.ispc runfail x86-64 avx512knl-i32x16 Mac LLVM 3.7 clang++3.7 -O0 *
-.\tests\acos.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 *
-.\tests\asin.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 *
-.\tests\memcpy-varying.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 *
-.\tests\operators2.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 *
-.\tests\packed-store-1.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 *
-.\tests\packed-store2-1.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 *
-.\tests\short-circuit-14.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 *
-.\tests\short-circuit-15.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O2 *
-.\tests\acos.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 *
-.\tests\asin.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 *
-.\tests\short-circuit-14.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 *
-.\tests\short-circuit-15.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 *
-.\tests\short-circuit-5.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 *
-.\tests\short-circuit-6.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 *
-.\tests\short-circuit-7.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 *
-.\tests\short-circuit-8.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 *
-.\tests\short-circuit-9.ispc runfail x86-64 avx512knl-i32x16 Windows LLVM 3.7 cl -O0 *
-./tests/operators2.ispc runfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/rand-distrib.ispc runfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/shift-1.ispc runfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/foreach-active-5.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/idiv.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/int64-max-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/int64-max.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/int64-min-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/int64-min.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/pmuls_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/pmuls_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/pmulus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/pmulus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/rand-distrib-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/reduce-max-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/reduce-max-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/reduce-min-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/reduce-min-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/rotate.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/shuffle2-5.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/uint64-max-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/uint64-max.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/uint64-min-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/uint64-min.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/idiv.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/int64-max-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/int64-max.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/int64-min-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/int64-min.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/paddus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/paddus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/pmuls_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/pmuls_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/pmulus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/pmulus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/psubus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/psubus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/reduce-max-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/reduce-max-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/reduce-min-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/reduce-min-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/uint64-max-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/uint64-max.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/uint64-min-1.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/uint64-min.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O0 *
-./tests/rand-distrib.ispc runfail x86-64 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/shift-1.ispc runfail x86-64 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
-./tests/shuffle2-5.ispc runfail x86-64 avx512knl-i32x16 Linux LLVM 3.7 clang++3.7 -O2 *
+./tests/foreach-active-5.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 *
+./tests/idiv.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 *
+./tests/pmuls_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 *
+./tests/pmuls_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 *
+./tests/pmulus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 *
+./tests/pmulus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 *
+./tests/reduce-max-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 *
+./tests/reduce-max-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 *
+./tests/reduce-min-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 *
+./tests/reduce-min-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O2 *
+./tests/idiv.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *
+./tests/paddus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *
+./tests/paddus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *
+./tests/pmuls_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *
+./tests/pmuls_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *
+./tests/pmulus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *
+./tests/pmulus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *
+./tests/psubus_i64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *
+./tests/psubus_vi64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *
+./tests/reduce-max-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *
+./tests/reduce-max-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *
+./tests/reduce-min-int64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *
+./tests/reduce-min-uint64.ispc compfail x86 avx512knl-i32x16 Linux LLVM 3.7 clang++3.4 -O0 *