From 153fbc3d7dfbd43da298fcefdf3258d5881d60d4 Mon Sep 17 00:00:00 2001
From: egaburov <egaburov.work@gmail.com>
Date: Mon, 29 Jul 2013 11:05:05 +0200
Subject: [PATCH] nvptx64: use NVVM sqrt/rsqrt intrinsics, add double rsqrt

---
 builtins/target-nvptx64.ll | 35 ++++++++++++++---------------------
 ptxtest/test.ispc          |  8 +++++---
 2 files changed, 19 insertions(+), 24 deletions(-)
diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll
index 030b08c0..f3f9bfd9 100644
--- a/builtins/target-nvptx64.ll
+++ b/builtins/target-nvptx64.ll
@@ -320,6 +320,9 @@ declare float     @llvm.log.f32(float %Val)
 declare float     @llvm.pow.f32(float %f, float %e)
 
 declare float     @llvm.nvvm.rsqrt.approx.f(float %f) nounwind readonly alwaysinline
+declare float     @llvm.nvvm.sqrt.f(float %f) nounwind readonly alwaysinline  ; called by __sqrt_varying_float
+declare double    @llvm.nvvm.rsqrt.approx.d(double %f) nounwind readonly alwaysinline  ; called by __rsqrt_varying_double
+declare double    @llvm.nvvm.sqrt.d(double %f) nounwind readonly alwaysinline  ; NOTE(review): no caller in this patch; LLVM may only provide rounding-mode double forms such as llvm.nvvm.sqrt.rn.d - TODO confirm this name exists
 
 
 
@@ -614,12 +617,10 @@ define  <1 x float> @__rcp_varying_float(<1 x float>) nounwind readonly alwaysin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; sqrt
 
-define  <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysinline {
-  ;%call = call <1 x float> @llvm.x86.sse.sqrt.ps(<1 x float> %0)
-  ;ret <1 x float> %call
-  %d = extractelement <1 x float> %0, i32 0
-  %r = call float @llvm.sqrt.f32(float %d)
-  %rv = insertelement <1 x float> undef, float %r, i32 0
+define  <1 x float> @__sqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline {  ; square root of the single lane of %v
+  %vs = extractelement <1 x float> %v, i32 0  ; unpack lane 0 to a scalar
+  %rs = call float @llvm.nvvm.sqrt.f(float %vs)  ; NVVM intrinsic replaces the generic llvm.sqrt.f32 call
+  %rv = insertelement <1 x float> undef , float %rs, i32 0  ; repack the scalar result as <1 x float>
   ret <1 x float> %rv
 }
 
@@ -627,27 +628,19 @@ define  <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysi
 ; rsqrt
 
 
-
 define  <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline {
-  ;  float is = __rsqrt_v(v);
-  ;%is = call <1 x float> @llvm.x86.sse.rsqrt.ps(<1 x float> %v)
-  ; Newton-Raphson iteration to improve precision
-  ;  return 0.5 * is * (3. - (v * is) * is);
-  ;%v_is = fmul <1 x float> %v, %is
-  ;%v_is_is = fmul <1 x float> %v_is, %is
-  ;%three_sub = fsub <1 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
-  ;%is_mul = fmul <1 x float> %is, %three_sub
-  ;%half_scale = fmul <1 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
-  ;ret <1 x float> %half_scale
-  ;%s = call <1 x float> @__sqrt_varying_float(<1 x float> %v)
-  ;%r = call <1 x float> @__rcp_varying_float(<1 x float> %s)
-  ;ret <1 x float> %r
   %vs = extractelement <1 x float> %v, i32 0
   %rs = call float @llvm.nvvm.rsqrt.approx.f(float %vs)
-;  %rs = call float asm "rsqrt.approx.f32 $0,$0", "=f,f"(float %vs)
+;  %rs = call float asm "rsqrt.approx.f32 $0, $1;", "=f,f"(float %vs)  ; example of inline PTX: $0 is the result, $1 is the input
   %rv = insertelement <1 x float> undef , float %rs, i32 0
   ret <1 x float> %rv
 }
+define  <1 x double> @__rsqrt_varying_double(<1 x double> %v) nounwind readonly alwaysinline {  ; approximate 1/sqrt of the single lane of %v
+  %val = extractelement <1 x double> %v, i32 0                   ; scalar copy of lane 0
+  %inv = call double @llvm.nvvm.rsqrt.approx.d(double %val)      ; NVVM rsqrt.approx intrinsic, double variant
+  %ret = insertelement <1 x double> undef, double %inv, i32 0    ; rewrap the scalar as a one-element vector
+  ret <1 x double> %ret
+}
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/ptxtest/test.ispc b/ptxtest/test.ispc
index d5fa2474..b1b55147 100644
--- a/ptxtest/test.ispc
+++ b/ptxtest/test.ispc
@@ -2,9 +2,11 @@ export void saxpy(const uniform float a, const uniform float x_[], const uniform
 {
   foreach (i = 0 ... n)
   {
-     const float x = x_[i];
-     const float y = y_[i];
-     const float z = y + a*x;
+     const double x = x_[i];
+     const double y = y_[i];
+     const double dz = y + a*x;
+     const double dz1 = 1.0/sqrt(dz);
+     const float z = dz1;
      z_[i] = rsqrt(z);
   }
 }