Added SVML support (experimental). For some reason all symbols are visible.

2013-09-11 15:16:50 +02:00
parent 9c79d4d182
commit 320c41ffcf
17 changed files with 216 additions and 269 deletions
--- a/builtins/target-avx-h.ll
+++ b/builtins/target-avx-h.ll
@@ -154,28 +154,11 @@ define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; svml

-; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
-; or, use the macro to call the 4-wide ones twice with our 8-wide
-; vectors...
-
-;;declare <4 x double> @__svml_sin4(<4 x double>)
-;;declare <4 x double> @__svml_cos4(<4 x double>)
-;;declare void @__svml_sincos4(<4 x double>, <4 x double> *, <4 x double> *)
-;;declare <4 x double> @__svml_tan4(<4 x double>)
-;;declare <4 x double> @__svml_atan4(<4 x double>)
-;;declare <4 x double> @__svml_atan24(<4 x double>, <4 x double>)
-;;declare <4 x double> @__svml_exp4(<4 x double>)
-;;declare <4 x double> @__svml_log4(<4 x double>)
-;;declare <4 x double> @__svml_pow4(<4 x double>, <4 x double>)
-declare <4 x float> @__svml_sin(<4 x float>)
-declare <4 x float> @__svml_cos(<4 x float>)
-declare void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *)
-declare <4 x float> @__svml_tan(<4 x float>)
-declare <4 x float> @__svml_atan(<4 x float>)
-declare <4 x float> @__svml_atan2(<4 x float>, <4 x float>)
-declare <4 x float> @__svml_exp(<4 x float>)
-declare <4 x float> @__svml_log(<4 x float>)
-declare <4 x float> @__svml_pow(<4 x float>, <4 x float>)
+include(`svml.m4')
+svmlf_declare(4)
+svmlf_define(4)
+svmld_declare(4)
+svmld_define(4)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -137,19 +137,9 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; svml

-; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
-; or, use the macro to call the 4-wide ones 4x with our 16-wide
-; vectors...
-
-declare <16 x float> @__svml_sin(<16 x float>)
-declare <16 x float> @__svml_cos(<16 x float>)
-declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
-declare <16 x float> @__svml_tan(<16 x float>)
-declare <16 x float> @__svml_atan(<16 x float>)
-declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
-declare <16 x float> @__svml_exp(<16 x float>)
-declare <16 x float> @__svml_log(<16 x float>)
-declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
+include(`svml.m4')
+svmlf_stubs(16)
+svmld_stubs(16)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -137,19 +137,11 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; svml

-; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
-; or, use the macro to call the 4-wide ones twice with our 8-wide
-; vectors...
-
-declare <8 x float> @__svml_sin(<8 x float>)
-declare <8 x float> @__svml_cos(<8 x float>)
-declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
-declare <8 x float> @__svml_tan(<8 x float>)
-declare <8 x float> @__svml_atan(<8 x float>)
-declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
-declare <8 x float> @__svml_exp(<8 x float>)
-declare <8 x float> @__svml_log(<8 x float>)
-declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
+include(`svml.m4')
+svmlf_declare(8)
+svmlf_define(8)
+svmld_declare(4)
+svmld_stubs(8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -310,6 +310,7 @@ declare double @round (double) nounwind readnone
 ;declare float     @llvm.sqrt.f32(float %Val)
 declare double    @llvm.sqrt.f64(double %Val)
 declare float     @llvm.sin.f32(float %Val)
+declare float     @llvm.asin.f32(float %Val)
 declare float     @llvm.cos.f32(float %Val)
 declare float     @llvm.sqrt.f32(float %Val)
 declare float     @llvm.exp.f32(float %Val)
@@ -651,7 +652,18 @@ define  <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff

-define  <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
+; Double-precision SVML entry points for the 1-wide target, declarations only.
+declare  <1 x double> @__svml_sind(<1 x double>) nounwind readnone alwaysinline
+declare  <1 x double> @__svml_asind(<1 x double>) nounwind readnone alwaysinline
+declare  <1 x double> @__svml_cosd(<1 x double>) nounwind readnone alwaysinline
+declare  void @__svml_sincosd(<1 x double>, <1 x double> *, <1 x double> *) nounwind alwaysinline
+declare  <1 x double> @__svml_tand(<1 x double>) nounwind readnone alwaysinline
+declare  <1 x double> @__svml_atand(<1 x double>) nounwind readnone alwaysinline
+declare  <1 x double> @__svml_atan2d(<1 x double>, <1 x double>) nounwind readnone alwaysinline
+declare  <1 x double> @__svml_expd(<1 x double>) nounwind readnone alwaysinline
+declare  <1 x double> @__svml_logd(<1 x double>) nounwind readnone alwaysinline
+declare  <1 x double> @__svml_powd(<1 x double>, <1 x double>) nounwind readnone alwaysinline
+define  <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
  ;ret <1 x float> %ret
  ;%r = extractelement <1 x float> %0, i32 0
@@ -662,7 +674,18 @@ define  <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
   
 }

-define  <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
+define  <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline {
+  ; 1-wide stand-in for the SVML single-precision arcsine entry point.
+  ; Nothing from SVML is called: the operand is forwarded to the macro
+  ; invoked below (defined outside this hunk) together with @llvm.asin.f32,
+  ; presumably applying the intrinsic to the single lane -- TODO confirm.
+  ; NOTE(review): check that every LLVM release this file is built against
+  ; really provides llvm.asin.*; it is only declared, never defined, here.
+  unary1to1(float,@llvm.asin.f32)
+   
+}
+
+define  <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
  ;ret <1 x float> %ret
  ;%r = extractelement <1 x float> %0, i32 0
@@ -673,18 +696,18 @@ define  <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {

 }

-define  void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
+define  void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind alwaysinline { ; stores sin to %1 and cos to %2, so it must not be readnone
 ;  %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0)
 ;  store <1 x float> %s, <1 x float> * %1
 ;  ret void
-   %sin = call <1 x float> @__svml_sin (<1 x float> %0)
-   %cos = call <1 x float> @__svml_cos (<1 x float> %0)
+   %sin = call <1 x float> @__svml_sinf(<1 x float> %0)
+   %cos = call <1 x float> @__svml_cosf(<1 x float> %0)
   store <1 x float> %sin, <1 x float> * %1
   store <1 x float> %cos, <1 x float> * %2
   ret void
 }

-define  <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
+define  <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
  ;ret <1 x float> %ret
  ;%r = extractelement <1 x float> %0, i32 0
@@ -696,7 +719,7 @@ define  <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
  ret <1 x float > %0
 }

-define  <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
+define  <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline {
 ;  %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
 ;  ret <1 x float> %ret
  ;%r = extractelement <1 x float> %0, i32 0
@@ -709,7 +732,7 @@ define  <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {

 }

-define  <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
+define  <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
  ;ret <1 x float> %ret
  ;%y = extractelement <1 x float> %0, i32 0
@@ -722,19 +745,19 @@ define  <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone al
  ret <1 x float > %0
 }

-define  <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
+define  <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline { ; 1-wide exp: the SVML call is commented out and @llvm.exp.f32 is used through the macro below
  ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
  ;ret <1 x float> %ret
  unary1to1(float, @llvm.exp.f32)
 }

-define  <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
+define  <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline { ; 1-wide log: the SVML call is commented out and @llvm.log.f32 is used through the macro below
  ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
  ;ret <1 x float> %ret
  unary1to1(float, @llvm.log.f32)
 }

-define  <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
+define  <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
  ;ret <1 x float> %ret
  %r = extractelement <1 x float> %0, i32 0
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -202,21 +202,15 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
 declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
 declare i64 @__count_leading_zeros_i64(i64) nounwind readnone

-;; svml
-
 ; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
 ; or, use the macro to call the 4-wide ones twice with our 8-wide
 ; vectors...

-declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
-declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
-declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
-declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
-declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
-declare <WIDTH x float> @__svml_log(<WIDTH x float>)
-declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
+;; svml
+
+include(`svml.m4')
+svmlf_stubs(WIDTH)
+svmld_stubs(WIDTH)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions
--- a/builtins/target-neon-common.ll
+++ b/builtins/target-neon-common.ll
@@ -316,15 +316,10 @@ define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
 ;; yuck.  We need declarations of these, even though we shouldnt ever
 ;; actually generate calls to them for the NEON target...

-declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
-declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
-declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
-declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
-declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
-declare <WIDTH x float> @__svml_log(<WIDTH x float>)
-declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
+
+include(`svml.m4')
+svmlf_stubs(WIDTH)
+svmld_stubs(WIDTH)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -105,28 +105,28 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff

-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+include(`svml.m4')
+svmlf_declare(4)
+svmld_declare(2)
+svmld_stubs(8)


-define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_sinf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline { ; 8-wide arcsine built on the 4-wide @__svml_asinf4, assumed to be declared by the SVML macros invoked earlier in this file -- TODO confirm
+  unary4to8(ret, float, @__svml_asinf4, %0)
+  ret <8 x float> %ret
+}
+
+define <8 x float> @__svml_cosf(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_cosf4, %0)
  ret <8 x float> %ret
 }

-define void @__svml_sincos(<8 x float>, <8 x float> *,
+define void @__svml_sincosf(<8 x float>, <8 x float> *,
                                    <8 x float> *) nounwind readnone alwaysinline {
  ; call svml_sincosf4 two times with the two 4-wide sub-vectors
  %a = shufflevector <8 x float> %0, <8 x float> undef,
@@ -155,33 +155,33 @@ define void @__svml_sincos(<8 x float>, <8 x float> *,
  ret void
 }

-define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_tanf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_atanf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_atan2(<8 x float>,
+define <8 x float> @__svml_atan2f(<8 x float>,
                                          <8 x float>) nounwind readnone alwaysinline {
  binary4to8(ret, float, @__svml_atan2f4, %0, %1)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_expf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_logf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_pow(<8 x float>,
+define <8 x float> @__svml_powf(<8 x float>,
                                        <8 x float>) nounwind readnone alwaysinline {
  binary4to8(ret, float, @__svml_powf4, %0, %1)
  ret <8 x float> %ret
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -496,62 +496,11 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff

-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
-
-
-define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
-  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
-  store <4 x float> %s, <4 x float> * %1
-  ret void
-}
-
-define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_expf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_logf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
-  ret <4 x float> %ret
-}
+include(`svml.m4')
+svmlf_declare(4)
+svmld_declare(2)
+svmlf_define(4)
+svmld_stubs(4)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
--- a/builtins/target-sse4-16.ll
+++ b/builtins/target-sse4-16.ll
@@ -209,16 +209,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
 ;; svml

 ; FIXME
-
-declare <8 x float> @__svml_sin(<8 x float>)
-declare <8 x float> @__svml_cos(<8 x float>)
-declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
-declare <8 x float> @__svml_tan(<8 x float>)
-declare <8 x float> @__svml_atan(<8 x float>)
-declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
-declare <8 x float> @__svml_exp(<8 x float>)
-declare <8 x float> @__svml_log(<8 x float>)
-declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
+include(`svml.m4')
+svmlf_stubs(8)
+svmld_stubs(8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
--- a/builtins/target-sse4-8.ll
+++ b/builtins/target-sse4-8.ll
@@ -222,15 +222,9 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin

 ; FIXME

-declare <16 x float> @__svml_sin(<16 x float>)
-declare <16 x float> @__svml_cos(<16 x float>)
-declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
-declare <16 x float> @__svml_tan(<16 x float>)
-declare <16 x float> @__svml_atan(<16 x float>)
-declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
-declare <16 x float> @__svml_exp(<16 x float>)
-declare <16 x float> @__svml_log(<16 x float>)
-declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
+include(`svml.m4')
+svmlf_stubs(16)
+svmld_stubs(16)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -105,28 +105,28 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff

-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+include(`svml.m4')
+svmlf_declare(4)
+svmld_declare(2)
+svmld_stubs(8)


-define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_sinf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline { ; 8-wide arcsine built on the 4-wide @__svml_asinf4, assumed to be declared by the SVML macros invoked earlier in this file -- TODO confirm
+  unary4to8(ret, float, @__svml_asinf4, %0)
+  ret <8 x float> %ret
+}
+
+define <8 x float> @__svml_cosf(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_cosf4, %0)
  ret <8 x float> %ret
 }

-define void @__svml_sincos(<8 x float>, <8 x float> *,
+define void @__svml_sincosf(<8 x float>, <8 x float> *,
                                    <8 x float> *) nounwind readnone alwaysinline {
  ; call svml_sincosf4 two times with the two 4-wide sub-vectors
  %a = shufflevector <8 x float> %0, <8 x float> undef,
@@ -155,33 +155,33 @@ define void @__svml_sincos(<8 x float>, <8 x float> *,
  ret void
 }

-define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_tanf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_atanf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_atan2(<8 x float>,
+define <8 x float> @__svml_atan2f(<8 x float>,
                                          <8 x float>) nounwind readnone alwaysinline {
  binary4to8(ret, float, @__svml_atan2f4, %0, %1)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_expf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_logf4, %0)
  ret <8 x float> %ret
 }

-define <8 x float> @__svml_pow(<8 x float>,
+define <8 x float> @__svml_powf(<8 x float>,
                                        <8 x float>) nounwind readnone alwaysinline {
  binary4to8(ret, float, @__svml_powf4, %0, %1)
  ret <8 x float> %ret
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -209,62 +209,11 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff

-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
-
-
-define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline {
-  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
-  store <4 x float> %s, <4 x float> * %1
-  ret void
-}
-
-define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_expf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_logf4(<4 x float> %0)
-  ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
-  %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
-  ret <4 x float> %ret
-}
+include(`svml.m4')
+svmlf_declare(4)
+svmlf_define(4)
+svmld_declare(2)
+svmld_stubs(4)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -3160,6 +3160,7 @@ define float @__stdlib_powf(float, float) nounwind readnone alwaysinline {
 }

 declare double @sin(double) nounwind readnone
+declare double @asin(double) nounwind readnone
 declare double @cos(double) nounwind readnone
 declare void @sincos(double, double *, double *) nounwind readnone
 declare double @tan(double) nounwind readnone
@@ -3174,6 +3175,11 @@ define double @__stdlib_sin(double) nounwind readnone alwaysinline {
  ret double %r
 }

+define double @__stdlib_asin(double) nounwind readnone alwaysinline { ; forwards to the external @asin declared earlier in this hunk
+  %angle = call double @asin(double %0)
+  ret double %angle
+}
+
 define double @__stdlib_cos(double) nounwind readnone alwaysinline {
  %r = call double @cos(double %0)
  ret double %r