From 153fbc3d7dfbd43da298fcefdf3258d5881d60d4 Mon Sep 17 00:00:00 2001 From: egaburov Date: Mon, 29 Jul 2013 11:05:05 +0200 Subject: [PATCH] some changes --- builtins/target-nvptx64.ll | 35 ++++++++++++++--------------------- ptxtest/test.ispc | 8 +++++--- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll index 030b08c0..f3f9bfd9 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx64.ll @@ -320,6 +320,9 @@ declare float @llvm.log.f32(float %Val) declare float @llvm.pow.f32(float %f, float %e) declare float @llvm.nvvm.rsqrt.approx.f(float %f) nounwind readonly alwaysinline +declare float @llvm.nvvm.sqrt.f(float %f) nounwind readonly alwaysinline +declare double @llvm.nvvm.rsqrt.approx.d(double %f) nounwind readonly alwaysinline +declare double @llvm.nvvm.sqrt.d(double %f) nounwind readonly alwaysinline @@ -614,12 +617,10 @@ define <1 x float> @__rcp_varying_float(<1 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; sqrt -define <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysinline { - ;%call = call <1 x float> @llvm.x86.sse.sqrt.ps(<1 x float> %0) - ;ret <1 x float> %call - %d = extractelement <1 x float> %0, i32 0 - %r = call float @llvm.sqrt.f32(float %d) - %rv = insertelement <1 x float> undef, float %r, i32 0 +define <1 x float> @__sqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline { + %vs = extractelement <1 x float> %v, i32 0 + %rs = call float @llvm.nvvm.sqrt.f(float %vs) + %rv = insertelement <1 x float> undef , float %rs, i32 0 ret <1 x float> %rv } @@ -627,27 +628,19 @@ define <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysi ; rsqrt - define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline { - ; float is = __rsqrt_v(v); - ;%is = call <1 x float> @llvm.x86.sse.rsqrt.ps(<1 x float> %v) - ; Newton-Raphson iteration to improve precision - ; return 0.5 * is * (3. - (v * is) * is); - ;%v_is = fmul <1 x float> %v, %is - ;%v_is_is = fmul <1 x float> %v_is, %is - ;%three_sub = fsub <1 x float> , %v_is_is - ;%is_mul = fmul <1 x float> %is, %three_sub - ;%half_scale = fmul <1 x float> , %is_mul - ;ret <1 x float> %half_scale - ;%s = call <1 x float> @__sqrt_varying_float(<1 x float> %v) - ;%r = call <1 x float> @__rcp_varying_float(<1 x float> %s) - ;ret <1 x float> %r %vs = extractelement <1 x float> %v, i32 0 %rs = call float @llvm.nvvm.rsqrt.approx.f(float %vs) -; %rs = call float asm "rsqrt.approx.f32 $0,$0", "=f,f"(float %vs) +; %rs = call float asm "rsqrt.approx.f32 $0,$0", "=f,f"(float %vs) ; example of inline ptx %rv = insertelement <1 x float> undef , float %rs, i32 0 ret <1 x float> %rv } +define <1 x double> @__rsqrt_varying_double(<1 x double> %v) nounwind readonly alwaysinline { + %vs = extractelement <1 x double> %v, i32 0 + %rs = call double @llvm.nvvm.rsqrt.approx.d(double %vs) + %rv = insertelement <1 x double> undef , double %rs, i32 0 + ret <1 x double> %rv +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/ptxtest/test.ispc b/ptxtest/test.ispc index d5fa2474..b1b55147 100644 --- a/ptxtest/test.ispc +++ b/ptxtest/test.ispc @@ -2,9 +2,11 @@ export void saxpy(const uniform float a, const uniform float x_[], const uniform { foreach (i = 0 ... n) { - const float x = x_[i]; - const float y = y_[i]; - const float z = y + a*x; + const double x = x_[i]; + const double y = y_[i]; + const double dz = y + a*x; + const double dz1 = 1.0/sqrt(dz); + const float z = dz1; z_[i] = rsqrt(z); } }