From 3a72e05c3e8cfacc797a281424cb0b1e95668904 Mon Sep 17 00:00:00 2001 From: evghenii Date: Sun, 2 Feb 2014 18:16:48 +0100 Subject: [PATCH 01/13] +1 --- builtins.cpp | 4 ++++ builtins/target-avx-x2.ll | 7 +++++++ builtins/target-avx.ll | 6 ++++++ builtins/target-generic-1.ll | 6 ++++++ builtins/target-generic-common.ll | 4 ++++ builtins/target-neon-16.ll | 6 ++++++ builtins/target-neon-32.ll | 6 ++++++ builtins/target-neon-8.ll | 6 ++++++ builtins/target-sse2-x2.ll | 6 ++++++ builtins/target-sse2.ll | 6 ++++++ builtins/target-sse4-16.ll | 7 +++++++ builtins/target-sse4-8.ll | 7 +++++++ builtins/target-sse4-x2.ll | 6 ++++++ builtins/target-sse4.ll | 6 ++++++ builtins/util.m4 | 34 +++++++++++++++++++++++++++++++ stdlib.ispc | 9 ++++++++ 16 files changed, 126 insertions(+) diff --git a/builtins.cpp b/builtins.cpp index d6eefde5..da66ed5f 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -497,6 +497,8 @@ lSetInternalFunctions(llvm::Module *module) { "__prefetch_read_uniform_nt", "__rcp_uniform_float", "__rcp_varying_float", + "__rcp_uniform_double", + "__rcp_varying_double", "__rdrand_i16", "__rdrand_i32", "__rdrand_i64", @@ -534,6 +536,8 @@ lSetInternalFunctions(llvm::Module *module) { "__round_varying_float", "__rsqrt_uniform_float", "__rsqrt_varying_float", + "__rsqrt_uniform_double", + "__rsqrt_varying_double", "__set_system_isa", "__sext_uniform_bool", "__sext_varying_bool", diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index f8fd5cd5..802f726b 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -687,3 +687,10 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1) ret <16 x double> %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e98a3843..2abed8fe 
100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -559,3 +559,9 @@ gen_scatter(float) gen_scatter(i64) gen_scatter(double) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index c43a12a7..40d7ae8c 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -992,3 +992,9 @@ declare @__float_to_half_varying( %v) nounwind read define_avgs() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 2b2b21c9..4f87ad26 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -191,9 +191,13 @@ declare @__max_varying_double(, declare float @__rsqrt_uniform_float(float) nounwind readnone declare float @__rcp_uniform_float(float) nounwind readnone +declare double @__rsqrt_uniform_double(double) nounwind readnone +declare double @__rcp_uniform_double(double) nounwind readnone declare float @__sqrt_uniform_float(float) nounwind readnone declare @__rcp_varying_float() nounwind readnone declare @__rsqrt_varying_float() nounwind readnone +declare @__rcp_varying_double() nounwind readnone +declare @__rsqrt_varying_double() nounwind readnone declare @__sqrt_varying_float() nounwind readnone declare double @__sqrt_uniform_double(double) nounwind readnone diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll index a0575927..0f144b66 100644 --- a/builtins/target-neon-16.ll +++ b/builtins/target-neon-16.ll @@ -515,3 +515,9 @@ define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone { %r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1) ret <8 x i16> %r } + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() diff --git a/builtins/target-neon-32.ll b/builtins/target-neon-32.ll index 30b062c9..d30efff2 100644 --- a/builtins/target-neon-32.ll +++ b/builtins/target-neon-32.ll @@ -485,3 +485,9 @@ define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone { %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1) ret <4 x i16> %r } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll index 2accfe53..4b6dfa3a 100644 --- a/builtins/target-neon-8.ll +++ b/builtins/target-neon-8.ll @@ -581,3 +581,9 @@ define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone { v8tov16(i16, %r0, %r1, %r) ret <16 x i16> %r } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 77bf1a9d..41b4be09 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -652,3 +652,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) ret <8 x double> %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index e42d4990..ac091cdb 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -587,3 +587,9 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 
reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 72b81ff0..55c94ce6 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -488,3 +488,10 @@ define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) { define_avg_up_int8() define_avg_up_int16() define_down_avgs() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 69b355e3..4d81df92 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -490,3 +490,10 @@ define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { define_avg_up_int8() define_avg_up_int16() define_down_avgs() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 842db53f..09088dc3 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -592,3 +592,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r define_avgs() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 16177b47..747a6a58 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -515,3 +515,9 @@ gen_scatter(double) define_avgs() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/util.m4 b/builtins/util.m4 index f9ae7cd1..a8340adb 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4531,3 +4531,37 @@ 
define(`define_avgs', ` define_up_avgs() define_down_avgs() ') + +define(`rsqrt_double', ` +define double @__rsqrt_uniform_double(double) nounwind alwaysinline readnone +{ + %flt = fptrunc double %0 to float + %res = call float @__rsqrt_uniform_float(float %flt) + %dres = fpext float %res to double + ret double %dres +} +define @__rsqrt_varying_double() nounwind alwaysinline readnone +{ + %flt = fptrunc %0 to + %res = call @__rsqrt_varying_float( %flt) + %dres = fpext %res to + ret %dres +} +') + +define(`rcp_double', ` +define double @__rcp_uniform_double(double) nounwind alwaysinline readnone +{ + %flt = fptrunc double %0 to float + %res = call float @__rcp_uniform_float(float %flt) + %dres = fpext float %res to double + ret double %dres +} +define @__rcp_varying_double() nounwind alwaysinline readnone +{ + %flt = fptrunc %0 to + %res = call @__rcp_varying_float( %flt) + %dres = fpext %res to + ret %dres +} +') diff --git a/stdlib.ispc b/stdlib.ispc index 3b17283d..de2221b2 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -3517,6 +3517,15 @@ static inline uniform double sqrt(uniform double v) { return __sqrt_uniform_double(v); } +__declspec(safe) +static inline double rsqrt(double v) { + return __rsqrt_varying_double(v); +} + +__declspec(safe) +static inline uniform double rsqrt(uniform double v) { + return __rsqrt_uniform_double(v); +} __declspec(safe) static inline double ldexp(double x, int n) { unsigned int64 ex = 0x7ff0000000000000; From 4515dd5c89e6b3ceecfe8d9eb5bffa7a519aaac8 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Sun, 2 Feb 2014 18:19:56 +0100 Subject: [PATCH 02/13] added tests for rcp/rsqrt double --- tests/test-147.ispc | 16 ++++++++++++++++ tests/test-148.ispc | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 tests/test-147.ispc create mode 100644 tests/test-148.ispc diff --git a/tests/test-147.ispc b/tests/test-147.ispc new file mode 100644 index 00000000..4091a78d --- /dev/null +++ b/tests/test-147.ispc @@ -0,0 +1,16 @@ + 
+export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + double x = aFOO[programIndex&0x3]; + double d, ix; + ix = rcp(x); + d = (ix - 1.0d0 / x); + d = (d < 0.0d0) ? -d : d; + RET[programIndex] = (d < 1d-7) ? 1.0d0 : 0.0d0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1.0d0; +} diff --git a/tests/test-148.ispc b/tests/test-148.ispc new file mode 100644 index 00000000..f8284e1a --- /dev/null +++ b/tests/test-148.ispc @@ -0,0 +1,16 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + double x = aFOO[programIndex]; + double d, invsqrt = rsqrt(x); + d = (x * (invsqrt * invsqrt)) - 1.0d0; + if (d < 0.0d0) d = -d; + RET[programIndex] = d > 1d-5; +} + + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} From b0753dc93d0dfb31e7af74a25f13dd656161bd73 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Sun, 2 Feb 2014 18:20:05 +0100 Subject: [PATCH 03/13] added double-version for rcp --- stdlib.ispc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/stdlib.ispc b/stdlib.ispc index de2221b2..42b60b52 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1391,6 +1391,16 @@ static inline uniform float rcp(uniform float v) { return __rcp_uniform_float(v); } +__declspec(safe) +static inline double rcp(double v) { + return __rcp_varying_double(v); +} + +__declspec(safe) +static inline uniform double rcp(uniform double v) { + return __rcp_uniform_double(v); +} + /////////////////////////////////////////////////////////////////////////// // min/max From eb1a495a7aab85773fdfd5cad37e29c54d380460 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Tue, 4 Feb 2014 14:44:54 +0100 Subject: [PATCH 04/13] added support for fast approximate rsqrt(double). 
Provides 16 digit accuracy but is over 3x faster than 1/sqrt(double) --- builtins/target-avx1-i64x4base.ll | 2 ++ builtins/target-generic-common.ll | 9 ++++---- builtins/util.m4 | 22 +++++++++---------- stdlib.ispc | 35 +++++++++++++++++++++++++------ tests/test-148.ispc | 4 ++-- 5 files changed, 48 insertions(+), 24 deletions(-) diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index e1832030..1b1e7c4f 100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -511,3 +511,5 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %call } +rsqrt_double() +rcp_double() diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 4f87ad26..dfdc7e9e 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -191,13 +191,14 @@ declare @__max_varying_double(, declare float @__rsqrt_uniform_float(float) nounwind readnone declare float @__rcp_uniform_float(float) nounwind readnone -declare double @__rsqrt_uniform_double(double) nounwind readnone -declare double @__rcp_uniform_double(double) nounwind readnone +declare double @__rsqrt_uniform_double(double, ) nounwind readnone +declare double @__rcp_uniform_double(double, ) nounwind readnone declare float @__sqrt_uniform_float(float) nounwind readnone declare @__rcp_varying_float() nounwind readnone declare @__rsqrt_varying_float() nounwind readnone -declare @__rcp_varying_double() nounwind readnone -declare @__rsqrt_varying_double() nounwind readnone +declare @__rcp_varying_double(, ) nounwind readnone +declare @__rsqrt_varying_double(, ) nounwind readnone + declare @__sqrt_varying_float() nounwind readnone declare double @__sqrt_uniform_double(double) nounwind readnone diff --git a/builtins/util.m4 b/builtins/util.m4 index a8340adb..3915bc4e 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4533,31 +4533,29 @@ define_down_avgs() ')
define(`rsqrt_double', ` -define double @__rsqrt_uniform_double(double) nounwind alwaysinline readnone +declare double @__rsqrt_safe_uniform_double___und(double, ) +define double @__rsqrt_uniform_double(double, ) nounwind alwaysinline readnone { - %flt = fptrunc double %0 to float - %res = call float @__rsqrt_uniform_float(float %flt) - %dres = fpext float %res to double - ret double %dres + %res = call double @__rsqrt_safe_uniform_double___und(double %0, %1) + ret double %res } -define @__rsqrt_varying_double() nounwind alwaysinline readnone +declare @__rsqrt_safe_varying_double___vyd(, ) +define @__rsqrt_varying_double(, ) nounwind alwaysinline readnone { - %flt = fptrunc %0 to - %res = call @__rsqrt_varying_float( %flt) - %dres = fpext %res to - ret %dres + %res = call @__rsqrt_safe_varying_double___vyd( %0, %1) + ret %res } ') define(`rcp_double', ` -define double @__rcp_uniform_double(double) nounwind alwaysinline readnone +define double @__rcp_uniform_double(double, ) nounwind alwaysinline readnone { %flt = fptrunc double %0 to float %res = call float @__rcp_uniform_float(float %flt) %dres = fpext float %res to double ret double %dres } -define @__rcp_varying_double() nounwind alwaysinline readnone +define @__rcp_varying_double(, ) nounwind alwaysinline readnone { %flt = fptrunc %0 to %res = call @__rcp_varying_float( %flt) diff --git a/stdlib.ispc b/stdlib.ispc index 42b60b52..70b5f0d1 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1393,12 +1393,12 @@ static inline uniform float rcp(uniform float v) { __declspec(safe) static inline double rcp(double v) { - return __rcp_varying_double(v); + return __rcp_varying_double(v, (IntMaskType)__mask); } __declspec(safe) static inline uniform double rcp(uniform double v) { - return __rcp_uniform_double(v); + return __rcp_uniform_double(v, (IntMaskType)__mask); } /////////////////////////////////////////////////////////////////////////// @@ -3527,14 +3527,37 @@ static inline uniform double sqrt(uniform double v) { 
return __sqrt_uniform_double(v); } -__declspec(safe) -static inline double rsqrt(double v) { - return __rsqrt_varying_double(v); +#define RSQRTD(QUAL) \ +__declspec(safe) \ +static inline QUAL double __rsqrt_iterate_double(QUAL double x, QUAL double y) \ +{ \ + QUAL double xh = x*0.5d; \ + y += y*(0.5d0 - xh*y*y); \ + y += y*(0.5d0 - xh*y*y); \ + return y; \ +} \ +__declspec(safe) \ +static inline QUAL double __rsqrt_safe_##QUAL##_double (QUAL double x) \ +{ \ + if (x <= 1.0d+33 && x >= 1.0d-33) \ + return __rsqrt_iterate_double(x, rsqrt((QUAL float)x)); \ + QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \ + QUAL double exp = doublebits( 0x7fd0000000000000 - ex ); /* 1.0d/exponent */ \ + QUAL double exph = doublebits( 0x5fe0000000000000 - (ex >> 1)); /* 1.0d/sqrt(exponent) */ \ + QUAL double y = rsqrt((QUAL float)(x*exp)); \ + return __rsqrt_iterate_double(x, y*exph); \ } +RSQRTD(varying) +__declspec(safe) +static inline double rsqrt(double v) { + return __rsqrt_varying_double(v, (IntMaskType)__mask); +} + +RSQRTD(uniform) __declspec(safe) static inline uniform double rsqrt(uniform double v) { - return __rsqrt_uniform_double(v); + return __rsqrt_uniform_double(v, (IntMaskType)__mask); } __declspec(safe) static inline double ldexp(double x, int n) { diff --git a/tests/test-148.ispc b/tests/test-148.ispc index f8284e1a..1c01428c 100644 --- a/tests/test-148.ispc +++ b/tests/test-148.ispc @@ -3,11 +3,11 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - double x = aFOO[programIndex]; + double x = aFOO[programIndex]*1d100; double d, invsqrt = rsqrt(x); d = (x * (invsqrt * invsqrt)) - 1.0d0; if (d < 0.0d0) d = -d; - RET[programIndex] = d > 1d-5; + RET[programIndex] = d > 1d-15; } From fe98fe8cdcca7c2adedeba9fbfff00c3cdeb19e2 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Tue, 4 Feb 2014 15:23:34 +0100 Subject: [PATCH 05/13] added fast approximate rcp(double) accurate to 15 digits --- builtins/util.m4 | 
18 ++++++++---------- stdlib.ispc | 27 ++++++++++++++++++++++++--- tests/test-147.ispc | 4 ++-- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/builtins/util.m4 b/builtins/util.m4 index 3915bc4e..74b5cdd3 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4548,18 +4548,16 @@ define @__rsqrt_varying_double(, ) nounwind alwaysinline readnone +declare double @__rcp_safe_uniform_double___und(double, ) +define double @__rcp_uniform_double(double, ) nounwind alwaysinline readnone { - %flt = fptrunc double %0 to float - %res = call float @__rcp_uniform_float(float %flt) - %dres = fpext float %res to double - ret double %dres + %res = call double @__rcp_safe_uniform_double___und(double %0, %1) + ret double %res } -define @__rcp_varying_double(, ) nounwind alwaysinline readnone +declare @__rcp_safe_varying_double___vyd(, ) +define @__rcp_varying_double(, ) nounwind alwaysinline readnone { - %flt = fptrunc %0 to - %res = call @__rcp_varying_float( %flt) - %dres = fpext %res to - ret %dres + %res = call @__rcp_safe_varying_double___vyd( %0, %1) + ret %res } ') diff --git a/stdlib.ispc b/stdlib.ispc index 70b5f0d1..bd12034c 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1391,11 +1391,32 @@ static inline uniform float rcp(uniform float v) { return __rcp_uniform_float(v); } +#define RCPD(QUAL) \ +__declspec(safe) \ +static inline QUAL double __rcp_iterate_##QUAL##_double(QUAL double v, QUAL double iv) \ +{ \ + iv = iv * (2.0d - v*iv); \ + iv = iv * (2.0d - v*iv); \ + return iv; \ +} \ +__declspec(safe) \ +static inline QUAL double __rcp_safe_##QUAL##_double(QUAL double x) \ +{ \ + if (x <= 1.0d+33 && x >= 1.0d-33) \ + return __rcp_iterate_##QUAL##_double(x, rcp((QUAL float)x)); \ + QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \ + QUAL double exp = doublebits( 0x7fd0000000000000 + ~ex ); \ + QUAL double y = rcp((QUAL float)(x*exp)); \ + return __rcp_iterate_##QUAL##_double(x, y*exp); \ +} + +RCPD(varying) __declspec(safe) static inline double rcp(double 
v) { return __rcp_varying_double(v, (IntMaskType)__mask); } +RCPD(uniform) __declspec(safe) static inline uniform double rcp(uniform double v) { return __rcp_uniform_double(v, (IntMaskType)__mask); @@ -3529,7 +3550,7 @@ static inline uniform double sqrt(uniform double v) { #define RSQRTD(QUAL) \ __declspec(safe) \ -static inline QUAL double __rsqrt_iterate_double(QUAL double x, QUAL double y) \ +static inline QUAL double __rsqrt_iterate_##QUAL##_double(QUAL double x, QUAL double y) \ { \ QUAL double xh = x*0.5d; \ y += y*(0.5d0 - xh*y*y); \ @@ -3540,12 +3561,12 @@ __declspec(safe) \ static inline QUAL double __rsqrt_safe_##QUAL##_double (QUAL double x) \ { \ if (x <= 1.0d+33 && x >= 1.0d-33) \ - return __rsqrt_iterate_double(x, rsqrt((QUAL float)x)); \ + return __rsqrt_iterate_##QUAL##_double(x, rsqrt((QUAL float)x)); \ QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \ QUAL double exp = doublebits( 0x7fd0000000000000 - ex ); /* 1.0d/exponent */ \ QUAL double exph = doublebits( 0x5fe0000000000000 - (ex >> 1)); /* 1.0d/sqrt(exponent) */ \ QUAL double y = rsqrt((QUAL float)(x*exp)); \ - return __rsqrt_iterate_double(x, y*exph); \ + return __rsqrt_iterate_##QUAL##_double(x, y*exph); \ } RSQRTD(varying) diff --git a/tests/test-147.ispc b/tests/test-147.ispc index 4091a78d..5c5d4bc1 100644 --- a/tests/test-147.ispc +++ b/tests/test-147.ispc @@ -3,12 +3,12 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - double x = aFOO[programIndex&0x3]; + double x = aFOO[programIndex&0x3]*1d100; double d, ix; ix = rcp(x); d = (ix - 1.0d0 / x); d = (d < 0.0d0) ? -d : d; - RET[programIndex] = (d < 1d-7) ? 1.0d0 : 0.0d0; + RET[programIndex] = (d < 1d-15 && !isnan(d)) ? 
1.0d0 : 0.0d0; } export void result(uniform float RET[]) { From d3a6693eef93b97a91fde0bcf20388aa95921b6e Mon Sep 17 00:00:00 2001 From: Evghenii Date: Tue, 4 Feb 2014 16:29:23 +0100 Subject: [PATCH 06/13] adding __have_native_{rsqrtd,rcpd} to select between native support for double precision reciprocals and using slower but safe version in stdlib --- builtins.cpp | 4 ++++ builtins/target-generic-common.ll | 8 ++++---- builtins/util.m4 | 28 ++++------------------------ ispc.cpp | 9 ++++++++- ispc.h | 10 ++++++++++ stdlib.ispc | 27 ++++++++++++++++++++------- 6 files changed, 50 insertions(+), 36 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index da66ed5f..fee322e7 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -1150,6 +1150,10 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__have_native_transcendentals", g->target->hasTranscendentals(), module, symbolTable); + lDefineConstantInt("__have_native_rsqrtd", g->target->hasRsqrtd(), + module, symbolTable); + lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(), + module, symbolTable); if (g->forceAlignment != -1) { llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true); diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index dfdc7e9e..401c862d 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -191,13 +191,13 @@ declare @__max_varying_double(, declare float @__rsqrt_uniform_float(float) nounwind readnone declare float @__rcp_uniform_float(float) nounwind readnone -declare double @__rsqrt_uniform_double(double, ) nounwind readnone -declare double @__rcp_uniform_double(double, ) nounwind readnone +declare double @__rsqrt_uniform_double(double) nounwind readnone +declare double @__rcp_uniform_double(double) nounwind readnone declare float @__sqrt_uniform_float(float) nounwind readnone declare @__rcp_varying_float() nounwind readnone 
declare @__rsqrt_varying_float() nounwind readnone -declare @__rcp_varying_double(, ) nounwind readnone -declare @__rsqrt_varying_double(, ) nounwind readnone +declare @__rcp_varying_double() nounwind readnone +declare @__rsqrt_varying_double() nounwind readnone declare @__sqrt_varying_float() nounwind readnone diff --git a/builtins/util.m4 b/builtins/util.m4 index 74b5cdd3..8a1f280a 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4533,31 +4533,11 @@ define_down_avgs() ') define(`rsqrt_double', ` -declare double @__rsqrt_safe_uniform_double___und(double, ) -define double @__rsqrt_uniform_double(double, ) nounwind alwaysinline readnone -{ - %res = call double @__rsqrt_safe_uniform_double___und(double %0, %1) - ret double %res -} -declare @__rsqrt_safe_varying_double___vyd(, ) -define @__rsqrt_varying_double(, ) nounwind alwaysinline readnone -{ - %res = call @__rsqrt_safe_varying_double___vyd( %0, %1) - ret %res -} +declare double @__rsqrt_uniform_double(double) +declare @__rsqrt_varying_double() ') define(`rcp_double', ` -declare double @__rcp_safe_uniform_double___und(double, ) -define double @__rcp_uniform_double(double, ) nounwind alwaysinline readnone -{ - %res = call double @__rcp_safe_uniform_double___und(double %0, %1) - ret double %res -} -declare @__rcp_safe_varying_double___vyd(, ) -define @__rcp_varying_double(, ) nounwind alwaysinline readnone -{ - %res = call @__rcp_safe_varying_double___vyd( %0, %1) - ret %res -} +declare double @__rcp_uniform_double(double) +declare @__rcp_varying_double() ') diff --git a/ispc.cpp b/ispc.cpp index ed326b14..1386d65e 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -201,7 +201,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_hasRand(false), m_hasGather(false), m_hasScatter(false), - m_hasTranscendentals(false) + m_hasTranscendentals(false), + m_hasRsqrtd(false), + m_hasRcpd(false) { if (isa == NULL) { if (cpu != NULL) { @@ -419,6 +421,7 @@ Target::Target(const char *arch, const 
char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-8") || !strcasecmp(isa, "generic-x8")) { @@ -431,6 +434,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-16") || !strcasecmp(isa, "generic-x16")) { @@ -443,6 +447,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-32") || !strcasecmp(isa, "generic-x32")) { @@ -455,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-64") || !strcasecmp(isa, "generic-x64")) { @@ -467,6 +473,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-1") || !strcasecmp(isa, "generic-x1")) { diff --git a/ispc.h b/ispc.h index 88eb8353..4b6df8c3 100644 --- a/ispc.h +++ b/ispc.h @@ -281,6 +281,10 @@ public: bool hasScatter() const {return m_hasScatter;} bool hasTranscendentals() const {return m_hasTranscendentals;} + + bool hasRsqrtd() const {return m_hasRsqrtd;} + + bool hasRcpd() const {return m_hasRcpd;} private: @@ -380,6 +384,12 @@ private: /** Indicates whether the target has support for 
transcendentals (beyond sqrt, which we assume that all of them handle). */ bool m_hasTranscendentals; + + /** Indicates whether there is an ISA double precision rsqrt. */ + bool m_hasRsqrtd; + + /** Indicates whether there is an ISA double precision rcp. */ + bool m_hasRcpd; }; diff --git a/stdlib.ispc b/stdlib.ispc index bd12034c..2f204aa0 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1412,14 +1412,21 @@ static inline QUAL double __rcp_safe_##QUAL##_double(QUAL double x) \ RCPD(varying) __declspec(safe) -static inline double rcp(double v) { - return __rcp_varying_double(v, (IntMaskType)__mask); -} +__declspec(safe) +static inline double rcp(double v) { + if (__have_native_rcpd) + return __rcp_varying_double(v); + else + return __rcp_safe_varying_double(v); +} RCPD(uniform) -__declspec(safe) +__declspec(safe) static inline uniform double rcp(uniform double v) { - return __rcp_uniform_double(v, (IntMaskType)__mask); + if (__have_native_rcpd) + return __rcp_uniform_double(v); + else + return __rcp_safe_uniform_double(v); } /////////////////////////////////////////////////////////////////////////// @@ -3572,13 +3579,19 @@ static inline QUAL double __rsqrt_safe_##QUAL##_double (QUAL double x) \ RSQRTD(varying) __declspec(safe) static inline double rsqrt(double v) { - return __rsqrt_varying_double(v, (IntMaskType)__mask); + if (__have_native_rsqrtd) + return __rsqrt_varying_double(v); + else + return __rsqrt_safe_varying_double(v); } RSQRTD(uniform) __declspec(safe) static inline uniform double rsqrt(uniform double v) { - return __rsqrt_uniform_double(v, (IntMaskType)__mask); + if (__have_native_rsqrtd) + return __rsqrt_uniform_double(v); + else + return __rsqrt_safe_uniform_double(v); } __declspec(safe) static inline double ldexp(double x, int n) { From 732a315a4ba6ecb4b20c6352f31f0d551099b3e5 Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 5 Feb 2014 13:04:45 +0100 Subject: [PATCH 07/13] removed __declspec(safe) duplicate --- stdlib.ispc | 1 - 1 file changed, 1 
deletion(-) diff --git a/stdlib.ispc b/stdlib.ispc index 2f204aa0..24217cd0 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1411,7 +1411,6 @@ static inline QUAL double __rcp_safe_##QUAL##_double(QUAL double x) \ } RCPD(varying) -__declspec(safe) __declspec(safe) static inline double rcp(double v) { if (__have_native_rcpd) From 09e8381ec7c15d11d2f973cfaf95028e788a2e03 Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 5 Feb 2014 13:05:04 +0100 Subject: [PATCH 08/13] change {rsqrt,rcp}_double to {rsqrt,rcp}d_decl --- builtins/target-avx-x2.ll | 4 ++-- builtins/target-avx.ll | 5 ++--- builtins/target-avx1-i64x4base.ll | 4 ++-- builtins/target-generic-1.ll | 5 ++--- builtins/target-neon-16.ll | 4 ++-- builtins/target-neon-32.ll | 4 ++-- builtins/target-neon-8.ll | 4 ++-- builtins/target-sse2-x2.ll | 4 ++-- builtins/target-sse2.ll | 4 ++-- builtins/target-sse4-16.ll | 5 ++--- builtins/target-sse4-8.ll | 5 ++--- builtins/target-sse4-x2.ll | 5 ++--- builtins/target-sse4.ll | 5 ++--- builtins/util.m4 | 4 ++-- 14 files changed, 28 insertions(+), 34 deletions(-) diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 802f726b..b3a77871 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -691,6 +691,6 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 2abed8fe..9738f9d3 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -562,6 +562,5 @@ gen_scatter(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() - +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index 1b1e7c4f..a6601a28 
100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -511,5 +511,5 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %call } -rsqrt_double() -rcp_double() +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 40d7ae8c..3dcd8373 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -995,6 +995,5 @@ define_avgs() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() - +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll index 0f144b66..8e0ef121 100644 --- a/builtins/target-neon-16.ll +++ b/builtins/target-neon-16.ll @@ -519,5 +519,5 @@ define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-neon-32.ll b/builtins/target-neon-32.ll index d30efff2..d6e861a2 100644 --- a/builtins/target-neon-32.ll +++ b/builtins/target-neon-32.ll @@ -489,5 +489,5 @@ define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll index 4b6dfa3a..aaa0a7b7 100644 --- a/builtins/target-neon-8.ll +++ b/builtins/target-neon-8.ll @@ -585,5 +585,5 @@ define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() +rsqrtd_decl() 
+rcpd_decl() diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 41b4be09..bfb927e5 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -656,5 +656,5 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index ac091cdb..93a8eb93 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -591,5 +591,5 @@ gen_scatter(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 55c94ce6..0de5c1b4 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -492,6 +492,5 @@ define_down_avgs() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() - +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 4d81df92..79f44212 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -494,6 +494,5 @@ define_down_avgs() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() - +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 09088dc3..ceff27f0 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -595,6 +595,5 @@ define_avgs() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() - +rsqrtd_decl() 
+rcpd_decl() diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 747a6a58..9e2ac8a5 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -518,6 +518,5 @@ define_avgs() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() - +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/util.m4 b/builtins/util.m4 index 8a1f280a..fbd929a1 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4532,12 +4532,12 @@ define_up_avgs() define_down_avgs() ') -define(`rsqrt_double', ` +define(`rsqrtd_decl', ` declare double @__rsqrt_uniform_double(double) declare @__rsqrt_varying_double() ') -define(`rcp_double', ` +define(`rcpd_decl', ` declare double @__rcp_uniform_double(double) declare @__rcp_varying_double() ') From 688d9c9a820c3fc8e34fe833e0fde16db8a62b12 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Wed, 5 Feb 2014 13:20:44 +0100 Subject: [PATCH 09/13] added support for rsqrtd/rcpd for generic-*.h --- examples/intrinsics/generic-16.h | 10 ++++++++++ examples/intrinsics/generic-32.h | 10 ++++++++++ examples/intrinsics/generic-64.h | 10 ++++++++++ 3 files changed, 30 insertions(+) diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 0aa8a3f6..3b5c6ec3 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1086,10 +1086,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); } +static FORCEINLINE double __rsqrt_uniform_double(double v) { + return 1.0 / sqrt(v); +} + static FORCEINLINE float __rcp_uniform_float(float v) { return 1.f / v; } +static FORCEINLINE double __rcp_uniform_double(double v) { + return 1.0 / v; +} + static FORCEINLINE float __sqrt_uniform_float(float v) { return sqrtf(v); } @@ -1099,7 +1107,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) { } UNARY_OP(__vec16_f, __rcp_varying_float, __rcp_uniform_float) 
+UNARY_OP(__vec16_d, __rcp_varying_double, __rcp_uniform_double) UNARY_OP(__vec16_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec16_d, __rsqrt_varying_double, __rsqrt_uniform_double) UNARY_OP(__vec16_f, __sqrt_varying_float, __sqrt_uniform_float) UNARY_OP(__vec16_d, __sqrt_varying_double, __sqrt_uniform_double) diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 924b049d..f5bb233c 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1138,10 +1138,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); } +static FORCEINLINE double __rsqrt_uniform_double(double v) { + return 1.0 / sqrt(v); +} + static FORCEINLINE float __rcp_uniform_float(float v) { return 1.f / v; } +static FORCEINLINE double __rcp_uniform_double(double v) { + return 1.0 / v; +} + static FORCEINLINE float __sqrt_uniform_float(float v) { return sqrtf(v); } @@ -1151,7 +1159,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) { } UNARY_OP(__vec32_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec32_d, __rcp_varying_double, __rcp_uniform_double) UNARY_OP(__vec32_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec32_d, __rsqrt_varying_double, __rsqrt_uniform_double) UNARY_OP(__vec32_f, __sqrt_varying_float, __sqrt_uniform_float) UNARY_OP(__vec32_d, __sqrt_varying_double, __sqrt_uniform_double) diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index b1451c96..a7148c8b 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1271,10 +1271,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); } +static FORCEINLINE double __rsqrt_uniform_double(double v) { + return 1.0 / sqrt(v); +} + static FORCEINLINE float __rcp_uniform_float(float v) { return 1.f / v; } +static FORCEINLINE double __rcp_uniform_double(double v) { + return 1.0 / v; +} + static FORCEINLINE float 
__sqrt_uniform_float(float v) { return sqrtf(v); } @@ -1284,7 +1292,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) { } UNARY_OP(__vec64_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec64_d, __rcp_varying_double, __rcp_uniform_double) UNARY_OP(__vec64_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec64_d, __rsqrt_varying_double, __rsqrt_uniform_double) UNARY_OP(__vec64_f, __sqrt_varying_float, __sqrt_uniform_float) UNARY_OP(__vec64_d, __sqrt_varying_double, __sqrt_uniform_double) From f225b558ec21b07cfbebfe7f84cedd2fb882405a Mon Sep 17 00:00:00 2001 From: Evghenii Date: Wed, 5 Feb 2014 13:42:45 +0100 Subject: [PATCH 10/13] added {rsqrt,rcp}d support for sse4.h --- examples/intrinsics/sse4.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 5dd424d9..45b31be1 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2420,6 +2420,21 @@ static FORCEINLINE __vec4_f __rcp_varying_float(__vec4_f v) { return r; } +static FORCEINLINE __vec4_d __rcp_varying_double(__vec4_d x) { + __vec4_i64 ex = __and(__cast_bits(__vec4_i64(), x), __smear_i64<__vec4_i64>(0x7fe0000000000000)); + __vec4_d exp = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x7fd0000000000000), ex)); + __vec4_f xf = __cast_fptrunc(__vec4_f(), __mul(x, exp)); + __vec4_f yf = __rcp_varying_float(xf); + __vec4_d y = __mul(__cast_fpext(__vec4_d(), yf), exp); + y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(2.0), __mul(x, y)))); + y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(2.0), __mul(x, y)))); + return y; +} +static FORCEINLINE double __rcp_uniform_double(double v) +{ + return __extract_element(__rcp_varying_double(__smear_double<__vec4_d>(v)),0); +} + static FORCEINLINE __vec4_f __rsqrt_varying_float(__vec4_f v) { __m128 rsqrt = _mm_rsqrt_ps(v.v); // Newton-Raphson iteration to improve precision @@ -2431,6 +2446,22 @@ static FORCEINLINE 
__vec4_f __rsqrt_varying_float(__vec4_f v) { __m128 half_scale = _mm_mul_ps(_mm_set1_ps(0.5), rs_mul); return half_scale; } +static FORCEINLINE __vec4_d __rsqrt_varying_double(__vec4_d x) { + __vec4_i64 ex = __and(__cast_bits(__vec4_i64(), x), __smear_i64<__vec4_i64>(0x7fe0000000000000)); + __vec4_d exp = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x7fd0000000000000), ex)); + __vec4_d exph = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x5fe0000000000000), __lshr(ex,1))); + __vec4_f xf = __cast_fptrunc(__vec4_f(), __mul(x, exp)); + __vec4_f yf = __rsqrt_varying_float(xf); + __vec4_d y = __mul(__cast_fpext(__vec4_d(), yf), exph); + __vec4_d xh = __mul(x, __smear_double<__vec4_d>(0.5)); + y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(0.5), __mul(xh, __mul(y,y))))); + y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(0.5), __mul(xh, __mul(y,y))))); + return y; +} +static FORCEINLINE double __rsqrt_uniform_double(double v) +{ + return __extract_element(__rsqrt_varying_double(__smear_double<__vec4_d>(v)),0); +} static FORCEINLINE __vec4_f __sqrt_varying_float(__vec4_f v) { return _mm_sqrt_ps(v.v); From eb01ffd4e6b11e0e3679fe59d316cfb629cb283d Mon Sep 17 00:00:00 2001 From: Evghenii Date: Wed, 5 Feb 2014 13:43:07 +0100 Subject: [PATCH 11/13] first commit for {rsqrt,rcp}d knc support. 
going to test on other node now --- examples/intrinsics/knc-i1x16.h | 31 +++++++++++++++++++++++++++++++ examples/intrinsics/knc-i1x8.h | 30 ++++++++++++++++++++++++++++++ examples/intrinsics/knc.h | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 141c47bb..ba6ef005 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1811,6 +1811,20 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) return _mm512_recip_ps(v); #endif } +static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) { + __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); + __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); + __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); + __vec16_f yf = __rcp_varying_float(xf); + __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exp); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); + return y; +} +static FORCEINLINE double __rcp_uniform_double(double v) +{ + return __extract_element(__rcp_varying_double(__smear_double<__vec16_d>(v)),0); +} static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { @@ -1820,6 +1834,23 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) return _mm512_invsqrt_ps(v); #endif } +static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) { + __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); + __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); + __vec16_d exph = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x5fe0000000000000), __lshr(ex,1))); + __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); + __vec16_f yf = 
__rsqrt_varying_float(xf); + __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exph); + __vec16_d xh = __mul(x, __smear_double<__vec16_d>(0.5)); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); + return y; +} +static FORCEINLINE double __rsqrt_uniform_double(double v) +{ + return __extract_element(__rsqrt_varying_double(__smear_double<__vec16_d>(v)),0); +} + static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index 32f39c4a..4a5b28a9 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -1836,6 +1836,20 @@ static FORCEINLINE __vec8_f __rcp_varying_float(__vec8_f v) { return _mm512_mask_recip_ps(FZERO, 0xFF, v); #endif } +static FORCEINLINE __vec8_d __rcp_varying_double(__vec8_d x) { + __vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000)); + __vec8_d exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex)); + __vec8_f xf = __cast_fptrunc(__vec8_f(), __mul(x, exp)); + __vec8_f yf = __rcp_varying_float(xf); + __vec8_d y = __mul(__cast_fpext(__vec8_d(), yf), exp); + y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(2.0), __mul(x, y)))); + y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(2.0), __mul(x, y)))); + return y; +} +static FORCEINLINE double __rcp_uniform_double(double v) +{ + return __extract_element(__rcp_varying_double(__smear_double<__vec8_d>(v)),0); +} static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) { #ifdef ISPC_FAST_MATH @@ -1844,6 +1858,22 @@ static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) { return _mm512_mask_invsqrt_ps(FZERO,0xFF,v); #endif } +static 
FORCEINLINE __vec8_d __rsqrt_varying_double(__vec8_d x) { + __vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000)); + __vec8_d exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex)); + __vec8_d exph = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x5fe0000000000000), __lshr(ex,1))); + __vec8_f xf = __cast_fptrunc(__vec8_f(), __mul(x, exp)); + __vec8_f yf = __rsqrt_varying_float(xf); + __vec8_d y = __mul(__cast_fpext(__vec8_d(), yf), exph); + __vec8_d xh = __mul(x, __smear_double<__vec8_d>(0.5)); + y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y,y))))); + y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y,y))))); + return y; +} +static FORCEINLINE double __rsqrt_uniform_double(double v) +{ + return __extract_element(__rsqrt_varying_double(__smear_double<__vec8_d>(v)),0); +} static FORCEINLINE __vec8_f __sqrt_varying_float (__vec8_f v) { return _mm512_mask_sqrt_ps(FZERO,0xFF,v);} #endif diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 0077ad88..72951845 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1472,6 +1472,21 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) { return _mm512_recip_ps(v); #endif } +static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) { + __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); + __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); + __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); + __vec16_f yf = __rcp_varying_float(xf); + __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exp); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); + return y; +} +static FORCEINLINE double __rcp_uniform_double(double v) +{ + return 
__extract_element(__rcp_varying_double(__smear_double<__vec16_d>(v)),0); +} + static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { #ifdef ISPC_FAST_MATH @@ -1480,6 +1495,23 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { return _mm512_invsqrt_ps(v); #endif } +static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) { + __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); + __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); + __vec16_d exph = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x5fe0000000000000), __lshr(ex,1))); + __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); + __vec16_f yf = __rsqrt_varying_float(xf); + __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exph); + __vec16_d xh = __mul(x, __smear_double<__vec16_d>(0.5)); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); + return y; +} +static FORCEINLINE double __rsqrt_uniform_double(double v) +{ + return __extract_element(__rsqrt_varying_double(__smear_double<__vec16_d>(v)),0); +} + static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) { return _mm512_exp_ps(v); From ecc9c88ff80ed44fdb31ff468103192ad4bc1e3b Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 5 Feb 2014 13:52:24 +0100 Subject: [PATCH 12/13] fix packed_store_active2 for knc-i1x8.h --- examples/intrinsics/knc-i1x8.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index 4a5b28a9..478ad75a 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -2526,8 +2526,8 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val, _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, 
_MM_HINT_NONE); return _mm_countbits_32(uint32_t(0xFF & mask)); } -static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val, - __vec4_i1 mask) { +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec8_i32 val, + __vec8_i1 mask) { return __packed_store_active(ptr, val, mask); } static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val, @@ -2538,8 +2538,8 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val, __vec8_i1 mask) { return __packed_store_active((uint32_t *)p, val, mask); } -static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val, - __vec4_i1 mask) { +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec8_i32 val, + __vec8_i1 mask) { return __packed_store_active(ptr, val, mask); } From c59cff396d4df0ffc74e9425072699b5ea750b7d Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 5 Feb 2014 13:55:38 +0100 Subject: [PATCH 13/13] added {rsqrt,rcp}d support for knc.h. test-147.ispc & test-148.ispc pass. 
--- examples/intrinsics/knc.h | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 72951845..458da458 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1473,18 +1473,14 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) { #endif } static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) { - __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); - __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); - __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); - __vec16_f yf = __rcp_varying_float(xf); - __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exp); - y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); - y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); - return y; + __vec16_d y; + for (int i = 0; i < 16; i++) + __insert_element(&y, i, 1.0/__extract_element(x,i)); + return y; } static FORCEINLINE double __rcp_uniform_double(double v) { - return __extract_element(__rcp_varying_double(__smear_double<__vec16_d>(v)),0); + return 1.0/v; } @@ -1496,20 +1492,14 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { #endif } static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) { - __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); - __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); - __vec16_d exph = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x5fe0000000000000), __lshr(ex,1))); - __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); - __vec16_f yf = __rsqrt_varying_float(xf); - __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exph); - __vec16_d xh = __mul(x, __smear_double<__vec16_d>(0.5)); - y = __add(y, __mul(y, 
__sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); - y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); - return y; + __vec16_d y; + for (int i = 0; i < 16; i++) + __insert_element(&y, i, 1.0/sqrt(__extract_element(x,i))); + return y; } static FORCEINLINE double __rsqrt_uniform_double(double v) { - return __extract_element(__rsqrt_varying_double(__smear_double<__vec16_d>(v)),0); + return 1.0/v; }