diff --git a/builtins.cpp b/builtins.cpp index f3a0cf59..886eec15 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -581,6 +581,15 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", + "__svml_sin", + "__svml_cos", + "__svml_sincos", + "__svml_tan", + "__svml_atan", + "__svml_atan2", + "__svml_exp", + "__svml_log", + "__svml_pow", "__undef_uniform", "__undef_varying", "__vec4_add_float", @@ -1050,6 +1059,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast, module, symbolTable); + lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module, + symbolTable); lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, symbolTable); lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 8fb2e427..d9e0322b 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -134,6 +134,23 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ret <16 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones 4x with our 16-wide +; vectors... + +declare <16 x float> @__svml_sin(<16 x float>) +declare <16 x float> @__svml_cos(<16 x float>) +declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) +declare <16 x float> @__svml_tan(<16 x float>) +declare <16 x float> @__svml_atan(<16 x float>) +declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) +declare <16 x float> @__svml_exp(<16 x float>) +declare <16 x float> @__svml_log(<16 x float>) +declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index adaed9ba..90e2f3ac 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -134,6 +134,23 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones twice with our 8-wide +; vectors... + +declare <8 x float> @__svml_sin(<8 x float>) +declare <8 x float> @__svml_cos(<8 x float>) +declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) +declare <8 x float> @__svml_tan(<8 x float>) +declare <8 x float> @__svml_atan(<8 x float>) +declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) +declare <8 x float> @__svml_exp(<8 x float>) +declare <8 x float> @__svml_log(<8 x float>) +declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 3472c207..31ebcdd5 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -647,6 +647,104 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.sin.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float,@llvm.sin.f32) + +} + +define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.cos.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float, @llvm.cos.f32) + +} + +define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { +; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) +; store <1 x float> %s, <1 x float> * %1 +; ret void + %sin = call <1 x float> @__svml_sin (<1 x float> %0) + %cos = call <1 x float> @__svml_cos (<1 x float> %0) + store <1 x float> %sin, <1 x float> * %1 + store <1 x float> %cos, <1 x float> * %2 + ret void +} + +define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm_tan_f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + ;unasry1to1(float, @llvm.tan.f32) + ; UNSUPPORTED! + ret <1 x float > %0 +} + +define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { +; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) +; ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm_atan_f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + ;unsary1to1(float,@llvm.atan.f32) + ;UNSUPPORTED! + ret <1 x float > %0 + +} + +define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) + ;ret <1 x float> %ret + ;%y = extractelement <1 x float> %0, i32 0 + ;%x = extractelement <1 x float> %1, i32 0 + ;%q = fdiv float %y, %x + ;%a = call float @llvm.atan.f32 (float %q) + ;%rv = insertelement <1 x float> undef, float %a, i32 0 + ;ret <1 x float> %rv + ; UNSUPPORTED! + ret <1 x float > %0 +} + +define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) + ;ret <1 x float> %ret + unary1to1(float, @llvm.exp.f32) +} + +define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) + ;ret <1 x float> %ret + unary1to1(float, @llvm.log.f32) +} + +define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) + ;ret <1 x float> %ret + %r = extractelement <1 x float> %0, i32 0 + %e = extractelement <1 x float> %1, i32 0 + %s = call float @llvm.pow.f32(float %r,float %e) + %rv = insertelement <1 x float> undef, float %s, i32 0 + ret <1 x float> %rv + +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index c683ff45..2896c6b1 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,6 +202,22 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone +;; svml + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones twice with our 8-wide +; vectors... + +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll index f892a0a1..696b0748 100644 --- a/builtins/target-neon-common.ll +++ b/builtins/target-neon-common.ll @@ -313,6 +313,19 @@ define void @__masked_store_blend_i64(* nocapture %ptr, ret void } +;; yuck. We need declarations of these, even though we shouldnt ever +;; actually generate calls to them for the NEON target... + +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 057ea98f..da22a66c 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -102,6 +102,92 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_sinf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_cosf4, %0) + ret <8 x float> %ret +} + +define void @__svml_sincos(<8 x float>, <8 x float> *, + <8 x float> *) nounwind readnone alwaysinline { + ; call svml_sincosf4 two times with the two 4-wide sub-vectors + %a = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + %b = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + + %cospa = alloca <4 x float> + %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) + + %cospb = alloca <4 x float> + %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) + + %sin = shufflevector <4 x float> %sa, <4 x float> %sb, + <8 x i32> + store <8 x float> %sin, <8 x float> * %1 + + %cosa = load <4 x float> * %cospa + %cosb = load <4 x float> * %cospb + %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, + <8 x i32> + store <8 x float> %cos, <8 x float> * %2 + + ret void +} + +define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_tanf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_atanf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_atan2(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_atan2f4, %0, %1) + ret <8 x float> %ret +} + +define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_expf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_logf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_pow(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_powf4, %0, %1) + ret <8 x float> %ret +} + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index e0a5c3d5..a6b206b6 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -493,6 +493,66 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ret <4 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) + ret <4 x float> %ret +} + +define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { + %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) + store <4 x float> %s, <4 x float> * %1 + ret void +} + +define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + +define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_expf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_logf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index b4772552..d7f3833d 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -205,6 +205,21 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ret <8 x double> %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <8 x float> @__svml_sin(<8 x float>) +declare <8 x float> @__svml_cos(<8 x float>) +declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) +declare <8 x float> @__svml_tan(<8 x float>) +declare <8 x float> @__svml_atan(<8 x float>) +declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) +declare <8 x float> @__svml_exp(<8 x float>) +declare <8 x float> @__svml_log(<8 x float>) +declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index a75d8e3a..fd4b74d7 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -217,6 +217,21 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ret <16 x double> %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; svml + +; FIXME + +declare <16 x float> @__svml_sin(<16 x float>) +declare <16 x float> @__svml_cos(<16 x float>) +declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) +declare <16 x float> @__svml_tan(<16 x float>) +declare <16 x float> @__svml_atan(<16 x float>) +declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) +declare <16 x float> @__svml_exp(<16 x float>) +declare <16 x float> @__svml_log(<16 x float>) +declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 897a09eb..a7faddb3 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -102,6 +102,92 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_sinf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_cosf4, %0) + ret <8 x float> %ret +} + +define void @__svml_sincos(<8 x float>, <8 x float> *, + <8 x float> *) nounwind readnone alwaysinline { + ; call svml_sincosf4 two times with the two 4-wide sub-vectors + %a = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + %b = shufflevector <8 x float> %0, <8 x float> undef, + <4 x i32> + + %cospa = alloca <4 x float> + %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) + + %cospb = alloca <4 x float> + %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) + + %sin = shufflevector <4 x float> %sa, <4 x float> %sb, + <8 x i32> + store <8 x float> %sin, <8 x float> * %1 + + %cosa = load <4 x float> * %cospa + %cosb = load <4 x float> * %cospb + %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, + <8 x i32> + store <8 x float> %cos, <8 x float> * %2 + + ret void +} + +define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_tanf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_atanf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_atan2(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_atan2f4, %0, %1) + ret <8 x float> %ret +} + +define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_expf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_logf4, %0) + ret <8 x float> %ret +} + +define <8 x float> @__svml_pow(<8 x float>, + <8 x float>) nounwind readnone alwaysinline { + binary4to8(ret, float, @__svml_powf4, %0, %1) + ret <8 x float> %ret +} + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 5429b461..e05b865f 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -206,6 +206,66 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %ret } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; svml stuff + +declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone +declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone +declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone + + +define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) + ret <4 x float> %ret +} + +define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { + %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) + store <4 x float> %s, <4 x float> * %1 + ret void +} + +define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + +define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_expf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_logf4(<4 x float> %0) + ret <4 x float> %ret +} + +define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { + %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) + ret <4 x float> %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/docs/ispc.rst b/docs/ispc.rst index 476046e8..ff07f6d8 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3333,6 +3333,9 @@ for this argument. approximately 1.45e-6 over the range -10pi to 10pi.) * ``fast``: more efficient but lower accuracy versions of the default ``ispc`` implementations. +* ``svml``: use Intel "Short Vector Math Library". Use + ``icc`` to link your final executable so that the appropriate libraries + are linked. * ``system``: use the system's math library. On many systems, these functions are more accurate than both of ``ispc``'s implementations. Using these functions may be quite diff --git a/ispc.h b/ispc.h index fc78e415..4804832f 100644 --- a/ispc.h +++ b/ispc.h @@ -488,7 +488,7 @@ struct Globals { /** There are a number of math libraries that can be used for transcendentals and the like during program compilation. */ - enum MathLib { Math_ISPC, Math_ISPCFast, Math_System }; + enum MathLib { Math_ISPC, Math_ISPCFast, Math_SVML, Math_System }; MathLib mathLib; /** Records whether the ispc standard library should be made available diff --git a/main.cpp b/main.cpp index 61c62042..21a47de8 100644 --- a/main.cpp +++ b/main.cpp @@ -112,6 +112,7 @@ usage(int ret) { printf(" [--math-lib=