Merge pull request #1184 from Shishpan/fix_knl_skx_target_ifelse

Fixed ifelse in the rsqrt and rcp definitions for the knl and skx targets (compilation fails with old LLVM).
2016-03-11 16:18:15 +03:00
parent 4f49ac4cb0 7691d961c1
commit 306f3468c3
2 changed files with 42 additions and 38 deletions
--- a/builtins/target-knl.ll
+++ b/builtins/target-knl.ll
@@ -31,7 +31,6 @@

 define(`WIDTH',`16')

-
 ifelse(LLVM_VERSION, LLVM_3_7,
    `include(`target-avx512-common.ll')',
         LLVM_VERSION, LLVM_3_8,
@@ -41,23 +40,27 @@ ifelse(LLVM_VERSION, LLVM_3_7,
  )

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rcp
+;; rcp, rsqrt

+define(`rcp_rsqrt_varying_float_knl',`
 declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
 define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
  %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %0, <16 x float> undef, i16 -1, i32 8)
  ret <16 x float> %res
 }
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rsqrt
-
 declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
 define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8)
  ret <16 x float> %res
 }
+')
+
+ifelse(LLVM_VERSION, LLVM_3_7,
+    rcp_rsqrt_varying_float_knl(),
+         LLVM_VERSION, LLVM_3_8,
+    rcp_rsqrt_varying_float_knl(),
+         LLVM_VERSION, LLVM_3_9,
+    rcp_rsqrt_varying_float_knl()
+  )

 ;;saturation_arithmetic_novec()
--- a/builtins/target-skx.ll
+++ b/builtins/target-skx.ll
@@ -39,47 +39,48 @@ ifelse(LLVM_VERSION, LLVM_3_8,
  )

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rcp
-
+;; rcp, rsqrt

+define(`rcp_rsqrt_varying_float_skx',`
 declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
-
 define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
  %call = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %0, <16 x float> undef, i16 -1)
-  ; do one Newton-Raphson iteration to improve precision
-  ;  float iv = __rcp_v(v);
-  ;  return iv * (2. - v * iv);
-  %v_iv = fmul <16 x float> %0, %call
-  %two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
-                                  float 2., float 2., float 2., float 2.,
-                                  float 2., float 2., float 2., float 2.,
-                                  float 2., float 2., float 2., float 2.>, %v_iv
-  %iv_mul = fmul <16 x float> %call, %two_minus
+  ;; do one Newton-Raphson iteration to improve precision
+  ;;  float iv = __rcp_v(v);
+  ;;  return iv * (2. - v * iv);
+  %v_iv = fmul <16 x float> %0`,' %call
+  %two_minus = fsub <16 x float> <float 2.`,' float 2.`,' float 2.`,' float 2.`,'
+                                  float 2.`,' float 2.`,' float 2.`,' float 2.`,'
+                                  float 2.`,' float 2.`,' float 2.`,' float 2.`,'
+                                  float 2.`,' float 2.`,' float 2.`,' float 2.>`,' %v_iv
+  %iv_mul = fmul <16 x float> %call`,'  %two_minus
  ret <16 x float> %iv_mul
 }
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rsqrt
-
-declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
-
+declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>`,'  <16 x float>`,'  i16) nounwind readnone
 define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
-  %is = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %v, <16 x float> undef, i16 -1)
+  %is = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %v`,'  <16 x float> undef`,'  i16 -1)
  ; Newton-Raphson iteration to improve precision
  ;  float is = __rsqrt_v(v);
  ;  return 0.5 * is * (3. - (v * is) * is);
-  %v_is = fmul <16 x float> %v, %is
-  %v_is_is = fmul <16 x float> %v_is, %is
-  %three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
-                                  float 3., float 3., float 3., float 3.,
-                                  float 3., float 3., float 3., float 3.,
-                                  float 3., float 3., float 3., float 3.>, %v_is_is
-  %is_mul = fmul <16 x float> %is, %three_sub
-  %half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
-                                   float 0.5, float 0.5, float 0.5, float 0.5,
-                                   float 0.5, float 0.5, float 0.5, float 0.5,
-                                   float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  %v_is = fmul <16 x float> %v`,'  %is
+  %v_is_is = fmul <16 x float> %v_is`,'  %is
+  %three_sub = fsub <16 x float> <float 3.`,' float 3.`,' float 3.`,' float 3.`,'
+                                  float 3.`,' float 3.`,' float 3.`,' float 3.`,'
+                                  float 3.`,' float 3.`,' float 3.`,' float 3.`,'
+                                  float 3.`,' float 3.`,' float 3.`,' float 3.>`,' %v_is_is
+  %is_mul = fmul <16 x float> %is`,'  %three_sub
+  %half_scale = fmul <16 x float> <float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
+                                   float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
+                                   float 0.5`,' float 0.5`,' float 0.5`,' float 0.5`,'
+                                   float 0.5`,' float 0.5`,' float 0.5`,' float 0.5>`,' %is_mul
  ret <16 x float> %half_scale
 }
+')
+
+ifelse(LLVM_VERSION, LLVM_3_8,
+    rcp_rsqrt_varying_float_skx(),
+         LLVM_VERSION, LLVM_3_9,
+    rcp_rsqrt_varying_float_skx()
+  )

 ;;saturation_arithmetic_novec()