Fixed the ifelse in the rsqrt and rcp definitions for knl and skx (compile failures with old LLVM).

This commit is contained in:
Andrey Shishpanov
2016-03-10 23:59:32 +03:00
parent 4f49ac4cb0
commit 7691d961c1
2 changed files with 42 additions and 38 deletions

View File

@@ -31,7 +31,6 @@
define(`WIDTH',`16')
ifelse(LLVM_VERSION, LLVM_3_7,
`include(`target-avx512-common.ll')',
LLVM_VERSION, LLVM_3_8,
@@ -41,23 +40,27 @@ ifelse(LLVM_VERSION, LLVM_3_7,
)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
;; rcp, rsqrt
;; m4 macro that emits the 16-wide float rcp and rsqrt functions.
;; Each function forwards its argument to the rcp28 or rsqrt28 intrinsic
;; and returns the intrinsic result directly with no refinement step.
define(`rcp_rsqrt_varying_float_knl',`
declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
;; the unnamed parameter is referenced as %0
;; NOTE(review): undef / i16 -1 / i32 8 are presumably the pass-through value / lane mask / rounding control - confirm against the intrinsic definition
%res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %0, <16 x float> undef, i16 -1, i32 8)
ret <16 x float> %res
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
;; same call shape as the rcp28 call but with the rsqrt28 intrinsic
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8)
ret <16 x float> %res
}
')
;; Expand the rcp/rsqrt macro when the LLVM version token is one of
;; LLVM_3_7 / LLVM_3_8 / LLVM_3_9. There is no default branch so any
;; other version token expands to nothing here.
ifelse(LLVM_VERSION, LLVM_3_7,
rcp_rsqrt_varying_float_knl(),
LLVM_VERSION, LLVM_3_8,
rcp_rsqrt_varying_float_knl(),
LLVM_VERSION, LLVM_3_9,
rcp_rsqrt_varying_float_knl()
)
;;saturation_arithmetic_novec()

View File

@@ -39,47 +39,48 @@ ifelse(LLVM_VERSION, LLVM_3_8,
)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
;; rcp, rsqrt
;; m4 macro that emits the 16-wide float rcp and rsqrt functions built on
;; the rcp14 and rsqrt14 intrinsics plus one Newton-Raphson step each.
;; Commas that sit outside parentheses in the body are wrapped in m4 quotes
;; so that they survive as literal commas when this macro is expanded inside
;; the argument list of another macro call.
define(`rcp_rsqrt_varying_float_skx',`
declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
  ;; the unnamed parameter is referenced as %0
  %call = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %0, <16 x float> undef, i16 -1)
  ;; do one Newton-Raphson iteration to improve precision
  ;; float iv = __rcp_v(v);
  ;; return iv * (2. - v * iv);
  %v_iv = fmul <16 x float> %0`,' %call
  %two_minus = fsub <16 x float> <float 2.`,' float 2.`,' float 2.`,' float 2.`,'
                                  float 2.`,' float 2.`,' float 2.`,' float 2.`,'
                                  float 2.`,' float 2.`,' float 2.`,' float 2.`,'
                                  float 2.`,' float 2.`,' float 2.`,' float 2.>`,' %v_iv
  %iv_mul = fmul <16 x float> %call`,' %two_minus
  ret <16 x float> %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>`,' <16 x float>`,' i16) nounwind readnone
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
  %is = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %v`,' <16 x float> undef`,' i16 -1)
  ;; Newton-Raphson iteration to improve precision
  ;; float is = __rsqrt_v(v);
  ;; return 0.5 * is * (3. - (v * is) * is);
  %v_is = fmul <16 x float> %v`,' %is
  %v_is_is = fmul <16 x float> %v_is`,' %is
  %three_sub = fsub <16 x float> <float 3.`,' float 3.`,' float 3.`,' float 3.`,'
                                  float 3.`,' float 3.`,' float 3.`,' float 3.`,'
                                  float 3.`,' float 3.`,' float 3.`,' float 3.`,'
                                  float 3.`,' float 3.`,' float 3.`,' float 3.>`,' %v_is_is
  %is_mul = fmul <16 x float> %is`,' %three_sub
  %half_scale = fmul <16 x float> <float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
                                   float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
                                   float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
                                   float 0.5`,' float 0.5`,' float 0.5`,' float 0.5>`,' %is_mul
  ret <16 x float> %half_scale
}
')
;; Expand the rcp/rsqrt macro when the LLVM version token is LLVM_3_8 or
;; LLVM_3_9. There is no default branch so any other version token
;; expands to nothing here.
ifelse(LLVM_VERSION, LLVM_3_8,
rcp_rsqrt_varying_float_skx(),
LLVM_VERSION, LLVM_3_9,
rcp_rsqrt_varying_float_skx()
)
;;saturation_arithmetic_novec()