Fixed ifelse in the rsqrt and rcp definitions for KNL and SKX (compilation failures with old LLVM).

This commit is contained in:
Andrey Shishpanov
2016-03-10 23:59:32 +03:00
parent 4f49ac4cb0
commit 7691d961c1
2 changed files with 42 additions and 38 deletions

View File

@@ -31,7 +31,6 @@
define(`WIDTH',`16') define(`WIDTH',`16')
ifelse(LLVM_VERSION, LLVM_3_7, ifelse(LLVM_VERSION, LLVM_3_7,
`include(`target-avx512-common.ll')', `include(`target-avx512-common.ll')',
LLVM_VERSION, LLVM_3_8, LLVM_VERSION, LLVM_3_8,
@@ -41,23 +40,27 @@ ifelse(LLVM_VERSION, LLVM_3_7,
) )
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp ;; rcp, rsqrt
define(`rcp_rsqrt_varying_float_knl',`
declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline { define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
%res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %0, <16 x float> undef, i16 -1, i32 8) %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %0, <16 x float> undef, i16 -1, i32 8)
ret <16 x float> %res ret <16 x float> %res
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8) %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8)
ret <16 x float> %res ret <16 x float> %res
} }
')
ifelse(LLVM_VERSION, LLVM_3_7,
rcp_rsqrt_varying_float_knl(),
LLVM_VERSION, LLVM_3_8,
rcp_rsqrt_varying_float_knl(),
LLVM_VERSION, LLVM_3_9,
rcp_rsqrt_varying_float_knl()
)
;;saturation_arithmetic_novec() ;;saturation_arithmetic_novec()

View File

@@ -39,47 +39,48 @@ ifelse(LLVM_VERSION, LLVM_3_8,
) )
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp ;; rcp, rsqrt
define(`rcp_rsqrt_varying_float_skx',`
declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline { define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
%call = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %0, <16 x float> undef, i16 -1) %call = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %0, <16 x float> undef, i16 -1)
; do one Newton-Raphson iteration to improve precision ;; do one Newton-Raphson iteration to improve precision
; float iv = __rcp_v(v); ;; float iv = __rcp_v(v);
; return iv * (2. - v * iv); ;; return iv * (2. - v * iv);
%v_iv = fmul <16 x float> %0, %call %v_iv = fmul <16 x float> %0`,' %call
%two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2., %two_minus = fsub <16 x float> <float 2.`,' float 2.`,' float 2.`,' float 2.`,'
float 2., float 2., float 2., float 2., float 2.`,' float 2.`,' float 2.`,' float 2.`,'
float 2., float 2., float 2., float 2., float 2.`,' float 2.`,' float 2.`,' float 2.`,'
float 2., float 2., float 2., float 2.>, %v_iv float 2.`,' float 2.`,' float 2.`,' float 2.>`,' %v_iv
%iv_mul = fmul <16 x float> %call, %two_minus %iv_mul = fmul <16 x float> %call`,' %two_minus
ret <16 x float> %iv_mul ret <16 x float> %iv_mul
} }
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>`,' <16 x float>`,' i16) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
%is = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %v, <16 x float> undef, i16 -1) %is = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %v`,' <16 x float> undef`,' i16 -1)
; Newton-Raphson iteration to improve precision ; Newton-Raphson iteration to improve precision
; float is = __rsqrt_v(v); ; float is = __rsqrt_v(v);
; return 0.5 * is * (3. - (v * is) * is); ; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul <16 x float> %v, %is %v_is = fmul <16 x float> %v`,' %is
%v_is_is = fmul <16 x float> %v_is, %is %v_is_is = fmul <16 x float> %v_is`,' %is
%three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3., %three_sub = fsub <16 x float> <float 3.`,' float 3.`,' float 3.`,' float 3.`,'
float 3., float 3., float 3., float 3., float 3.`,' float 3.`,' float 3.`,' float 3.`,'
float 3., float 3., float 3., float 3., float 3.`,' float 3.`,' float 3.`,' float 3.`,'
float 3., float 3., float 3., float 3.>, %v_is_is float 3.`,' float 3.`,' float 3.`,' float 3.>`,' %v_is_is
%is_mul = fmul <16 x float> %is, %three_sub %is_mul = fmul <16 x float> %is`,' %three_sub
%half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5, %half_scale = fmul <16 x float> <float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
float 0.5, float 0.5, float 0.5, float 0.5, float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
float 0.5, float 0.5, float 0.5, float 0.5, float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul float 0.5`,' float 0.5`,' float 0.5`,' float 0.5>`,' %is_mul
ret <16 x float> %half_scale ret <16 x float> %half_scale
} }
')
ifelse(LLVM_VERSION, LLVM_3_8,
rcp_rsqrt_varying_float_skx(),
LLVM_VERSION, LLVM_3_9,
rcp_rsqrt_varying_float_skx()
)
;;saturation_arithmetic_novec() ;;saturation_arithmetic_novec()