Fixed the ifelse guards around the rsqrt and rcp definitions for knl and skx (fixes compile failures with old LLVM).
This commit is contained in:
@@ -31,7 +31,6 @@
|
||||
|
||||
define(`WIDTH',`16')
|
||||
|
||||
|
||||
ifelse(LLVM_VERSION, LLVM_3_7,
|
||||
`include(`target-avx512-common.ll')',
|
||||
LLVM_VERSION, LLVM_3_8,
|
||||
@@ -41,23 +40,27 @@ ifelse(LLVM_VERSION, LLVM_3_7,
|
||||
)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
;; rcp, rsqrt
|
||||
|
||||
;; m4 macro holding the 16-wide float rcp and rsqrt definitions. The text in
;; the macro body is only emitted where the macro is later expanded.
define(`rcp_rsqrt_varying_float_knl',`
|
||||
;; reciprocal: forwarded directly to the rcp28 intrinsic declared below
declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
|
||||
|
||||
;; __rcp_varying_float: returns the rcp28 intrinsic result for the unnamed argument %0
define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
;; NOTE(review): undef and i16 -1 look like a passthrough vector and an all-lanes mask
;; and i32 8 looks like a rounding or exception control - confirm against the intrinsic
%res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %0, <16 x float> undef, i16 -1, i32 8)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
;; reciprocal square root: forwarded directly to the rsqrt28 intrinsic declared below
|
||||
|
||||
declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
|
||||
|
||||
;; __rsqrt_varying_float: returns the rsqrt28 intrinsic result for %v
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
|
||||
;; same trailing operands as the rcp28 call: undef then i16 -1 then i32 8
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
|
||||
')
|
||||
|
||||
;; Emit the rcp/rsqrt definitions only for LLVM 3.7, 3.8 and 3.9. All three
;; branches expand the same macro, and there is no fallback argument, so any
;; other version gets no definitions from this statement.
ifelse(LLVM_VERSION, LLVM_3_7,
|
||||
rcp_rsqrt_varying_float_knl(),
|
||||
LLVM_VERSION, LLVM_3_8,
|
||||
rcp_rsqrt_varying_float_knl(),
|
||||
LLVM_VERSION, LLVM_3_9,
|
||||
rcp_rsqrt_varying_float_knl()
|
||||
)
|
||||
|
||||
;;saturation_arithmetic_novec()
|
||||
|
||||
@@ -39,47 +39,48 @@ ifelse(LLVM_VERSION, LLVM_3_8,
|
||||
)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
;; rcp, rsqrt
|
||||
|
||||
define(`rcp_rsqrt_varying_float_skx',`
|
||||
declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
|
||||
|
||||
define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
|
||||
%call = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %0, <16 x float> undef, i16 -1)
|
||||
; do one Newton-Raphson iteration to improve precision
|
||||
; float iv = __rcp_v(v);
|
||||
; return iv * (2. - v * iv);
|
||||
%v_iv = fmul <16 x float> %0, %call
|
||||
%two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.,
|
||||
float 2., float 2., float 2., float 2.>, %v_iv
|
||||
%iv_mul = fmul <16 x float> %call, %two_minus
|
||||
;; do one Newton-Raphson iteration to improve precision
|
||||
;; float iv = __rcp_v(v);
|
||||
;; return iv * (2. - v * iv);
|
||||
%v_iv = fmul <16 x float> %0`,' %call
|
||||
%two_minus = fsub <16 x float> <float 2.`,' float 2.`,' float 2.`,' float 2.`,'
|
||||
float 2.`,' float 2.`,' float 2.`,' float 2.`,'
|
||||
float 2.`,' float 2.`,' float 2.`,' float 2.`,'
|
||||
float 2.`,' float 2.`,' float 2.`,' float 2.>`,' %v_iv
|
||||
%iv_mul = fmul <16 x float> %call`,' %two_minus
|
||||
ret <16 x float> %iv_mul
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rsqrt
|
||||
|
||||
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
|
||||
|
||||
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>`,' <16 x float>`,' i16) nounwind readnone
|
||||
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
|
||||
%is = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %v, <16 x float> undef, i16 -1)
|
||||
%is = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %v`,' <16 x float> undef`,' i16 -1)
|
||||
; Newton-Raphson iteration to improve precision
|
||||
; float is = __rsqrt_v(v);
|
||||
; return 0.5 * is * (3. - (v * is) * is);
|
||||
%v_is = fmul <16 x float> %v, %is
|
||||
%v_is_is = fmul <16 x float> %v_is, %is
|
||||
%three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.,
|
||||
float 3., float 3., float 3., float 3.>, %v_is_is
|
||||
%is_mul = fmul <16 x float> %is, %three_sub
|
||||
%half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5,
|
||||
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
|
||||
%v_is = fmul <16 x float> %v`,' %is
|
||||
%v_is_is = fmul <16 x float> %v_is`,' %is
|
||||
%three_sub = fsub <16 x float> <float 3.`,' float 3.`,' float 3.`,' float 3.`,'
|
||||
float 3.`,' float 3.`,' float 3.`,' float 3.`,'
|
||||
float 3.`,' float 3.`,' float 3.`,' float 3.`,'
|
||||
float 3.`,' float 3.`,' float 3.`,' float 3.>`,' %v_is_is
|
||||
%is_mul = fmul <16 x float> %is`,' %three_sub
|
||||
%half_scale = fmul <16 x float> <float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
|
||||
float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
|
||||
float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
|
||||
float 0.5`,' float 0.5`,' float 0.5`,' float 0.5>`,' %is_mul
|
||||
ret <16 x float> %half_scale
|
||||
}
|
||||
')
|
||||
|
||||
;; Emit the rcp/rsqrt definitions only for LLVM 3.8 and 3.9. Both branches
;; expand the same macro, and there is no fallback argument, so any other
;; version gets no definitions from this statement.
ifelse(LLVM_VERSION, LLVM_3_8,
|
||||
rcp_rsqrt_varying_float_skx(),
|
||||
LLVM_VERSION, LLVM_3_9,
|
||||
rcp_rsqrt_varying_float_skx()
|
||||
)
|
||||
|
||||
;;saturation_arithmetic_novec()
|
||||
|
||||
Reference in New Issue
Block a user