Merge pull request #1184 from Shishpan/fix_knl_skx_target_ifelse

Fixed the ifelse usage in the rsqrt and rcp definitions for the knl and skx targets (compilation failed with old LLVM versions).
This commit is contained in:
Dmitry Babokin
2016-03-11 16:18:15 +03:00
2 changed files with 42 additions and 38 deletions

View File

@@ -31,7 +31,6 @@
define(`WIDTH',`16')
ifelse(LLVM_VERSION, LLVM_3_7,
`include(`target-avx512-common.ll')',
LLVM_VERSION, LLVM_3_8,
@@ -41,23 +40,27 @@ ifelse(LLVM_VERSION, LLVM_3_7,
)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
;; rcp, rsqrt
;; Emits the reciprocal and reciprocal square root entry points for 16-wide
;; float vectors. Each entry point is a single call to the rcp28 / rsqrt28
;; intrinsic with undef as the second vector operand and i16 -1 (all bits
;; set) as the third operand.
;; NOTE(review): the trailing i32 8 operand presumably selects the rounding
;; or exception behaviour of the instruction - confirm before changing it.
;; Every comma in the body sits inside parentheses so the expansion can be
;; used directly as a macro argument.
define(`rcp_rsqrt_varying_float_knl',`
declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone

;; reciprocal
define <16 x float> @__rcp_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
  %call = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8)
  ret <16 x float> %call
}

;; reciprocal square root
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
  %call = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8)
  ret <16 x float> %call
}
')
;; Expand the rcp / rsqrt definitions from the macro above only for the LLVM
;; releases handled here: 3.7 and 3.8 and 3.9. All three branches emit the
;; same text. There is no default branch so any other version emits nothing.
;; Do not put comments between the parentheses below - they would become
;; part of the arguments.
ifelse(LLVM_VERSION, LLVM_3_7,
rcp_rsqrt_varying_float_knl(),
LLVM_VERSION, LLVM_3_8,
rcp_rsqrt_varying_float_knl(),
LLVM_VERSION, LLVM_3_9,
rcp_rsqrt_varying_float_knl()
)
;;saturation_arithmetic_novec()

View File

@@ -39,47 +39,48 @@ ifelse(LLVM_VERSION, LLVM_3_8,
)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
;; rcp, rsqrt
;; Emits the reciprocal and reciprocal square root entry points for 16-wide
;; float vectors on top of the rcp14 / rsqrt14 intrinsics. Each estimate is
;; refined with one Newton-Raphson step (the formulas are given in the
;; bodies).
;;
;; The text of this macro is expanded as an argument of the version check
;; that follows it. m4 splits arguments at every comma that is not enclosed
;; in parentheses or quotes - so each comma outside parentheses (the vector
;; constants and the fmul / fsub operand lists) carries its own m4 quotes.
;; Keep every instruction exactly once: an extra unquoted copy would both
;; break the argument list and redefine the same SSA names.
define(`rcp_rsqrt_varying_float_skx',`
declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
%call = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %0, <16 x float> undef, i16 -1)
;; do one Newton-Raphson iteration to improve precision
;; float iv = __rcp_v(v);
;; return iv * (2. - v * iv);
%v_iv = fmul <16 x float> %0`,' %call
%two_minus = fsub <16 x float> <float 2.`,' float 2.`,' float 2.`,' float 2.`,'
float 2.`,' float 2.`,' float 2.`,' float 2.`,'
float 2.`,' float 2.`,' float 2.`,' float 2.`,'
float 2.`,' float 2.`,' float 2.`,' float 2.>`,' %v_iv
%iv_mul = fmul <16 x float> %call`,' %two_minus
ret <16 x float> %iv_mul
}
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>`,' <16 x float>`,' i16) nounwind readnone
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
%is = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %v`,' <16 x float> undef`,' i16 -1)
; Newton-Raphson iteration to improve precision
; float is = __rsqrt_v(v);
; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <16 x float> %v`,' %is
%v_is_is = fmul <16 x float> %v_is`,' %is
%three_sub = fsub <16 x float> <float 3.`,' float 3.`,' float 3.`,' float 3.`,'
float 3.`,' float 3.`,' float 3.`,' float 3.`,'
float 3.`,' float 3.`,' float 3.`,' float 3.`,'
float 3.`,' float 3.`,' float 3.`,' float 3.>`,' %v_is_is
%is_mul = fmul <16 x float> %is`,' %three_sub
%half_scale = fmul <16 x float> <float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
float 0.5`,' float 0.5`,' float 0.5`,' float 0.5>`,' %is_mul
ret <16 x float> %half_scale
}
')
;; Expand the rcp / rsqrt definitions from the macro above only for the LLVM
;; releases handled here: 3.8 and 3.9. Both branches emit the same text.
;; There is no default branch so any other version emits nothing.
;; Do not put comments between the parentheses below - they would become
;; part of the arguments.
ifelse(LLVM_VERSION, LLVM_3_8,
rcp_rsqrt_varying_float_skx(),
LLVM_VERSION, LLVM_3_9,
rcp_rsqrt_varying_float_skx()
)
;;saturation_arithmetic_novec()