Merge pull request #730 from egaburov/double_math

Added double precision support for reciprocals: rsqrt rcp
2014-02-05 17:57:39 +03:00
parent 9a3b949687 c59cff396d
commit 2570385770
28 changed files with 367 additions and 5 deletions
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -497,6 +497,8 @@ lSetInternalFunctions(llvm::Module *module) {
        "__prefetch_read_uniform_nt",
        "__rcp_uniform_float",
        "__rcp_varying_float",
+        "__rcp_uniform_double",
+        "__rcp_varying_double",
        "__rdrand_i16",
        "__rdrand_i32",
        "__rdrand_i64",
@@ -534,6 +536,8 @@ lSetInternalFunctions(llvm::Module *module) {
        "__round_varying_float",
        "__rsqrt_uniform_float",
        "__rsqrt_varying_float",
+        "__rsqrt_uniform_double",
+        "__rsqrt_varying_double",
        "__set_system_isa",
        "__sext_uniform_bool",
        "__sext_varying_bool",
@@ -1146,6 +1150,10 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                       symbolTable);
    lDefineConstantInt("__have_native_transcendentals", g->target->hasTranscendentals(),
                       module, symbolTable);
+    lDefineConstantInt("__have_native_rsqrtd", g->target->hasRsqrtd(),
+                       module, symbolTable);
+    lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(),
+                       module, symbolTable);

    if (g->forceAlignment != -1) {
        llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -687,3 +687,10 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin
  binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
  ret <16 x double> %ret
 }
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reciprocals in double precision, if supported
+
+rsqrtd_decl()
+rcpd_decl()
+
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -559,3 +559,8 @@ gen_scatter(float)
 gen_scatter(i64)
 gen_scatter(double)

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reciprocals in double precision, if supported
+
+rsqrtd_decl()
+rcpd_decl()
--- a/builtins/target-avx1-i64x4base.ll
+++ b/builtins/target-avx1-i64x4base.ll
@@ -511,3 +511,5 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
  ret <4 x double> %call
 }

+rsqrtd_decl()
+rcpd_decl()
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -992,3 +992,8 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read

 define_avgs()

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reciprocals in double precision, if supported
+
+rsqrtd_decl()
+rcpd_decl()
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -191,9 +191,14 @@ declare <WIDTH x double> @__max_varying_double(<WIDTH x double>,

 declare float @__rsqrt_uniform_float(float) nounwind readnone 
 declare float @__rcp_uniform_float(float) nounwind readnone 
+declare double @__rsqrt_uniform_double(double) nounwind readnone 
+declare double @__rcp_uniform_double(double) nounwind readnone 
 declare float @__sqrt_uniform_float(float) nounwind readnone 
 declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone 
 declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone 
+declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>) nounwind readnone 
+declare <WIDTH x double> @__rsqrt_varying_double(<WIDTH x double>) nounwind readnone 
+
 declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone 

 declare double @__sqrt_uniform_double(double) nounwind readnone
--- a/builtins/target-neon-16.ll
+++ b/builtins/target-neon-16.ll
@@ -515,3 +515,9 @@ define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone {
  %r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
  ret <8 x i16> %r
 }
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reciprocals in double precision, if supported
+
+rsqrtd_decl()
+rcpd_decl()
--- a/builtins/target-neon-32.ll
+++ b/builtins/target-neon-32.ll
@@ -485,3 +485,9 @@ define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone {
  %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
  ret <4 x i16> %r
 }
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reciprocals in double precision, if supported
+
+rsqrtd_decl()
+rcpd_decl()
--- a/builtins/target-neon-8.ll
+++ b/builtins/target-neon-8.ll
@@ -581,3 +581,9 @@ define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone {
  v8tov16(i16, %r0, %r1, %r)
  ret <16 x i16> %r
 }
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reciprocals in double precision, if supported
+
+rsqrtd_decl()
+rcpd_decl()
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -652,3 +652,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
  binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
  ret <8 x double> %ret
 }
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reciprocals in double precision, if supported
+
+rsqrtd_decl()
+rcpd_decl()
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -587,3 +587,9 @@ gen_scatter(i32)
 gen_scatter(float)
 gen_scatter(i64)
 gen_scatter(double)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reciprocals in double precision, if supported
+
+rsqrtd_decl()
+rcpd_decl()
--- a/builtins/target-sse4-16.ll
+++ b/builtins/target-sse4-16.ll
@@ -488,3 +488,9 @@ define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) {
 define_avg_up_int8()
 define_avg_up_int16()
 define_down_avgs()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reciprocals in double precision, if supported
+
+rsqrtd_decl()
+rcpd_decl()
--- a/builtins/target-sse4-8.ll
+++ b/builtins/target-sse4-8.ll
@@ -490,3 +490,9 @@ define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
 define_avg_up_int8()
 define_avg_up_int16()
 define_down_avgs()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reciprocals in double precision, if supported
+
+rsqrtd_decl()
+rcpd_decl()
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -592,3 +592,8 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r

 define_avgs()

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reciprocals in double precision, if supported
+
+rsqrtd_decl()
+rcpd_decl()
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -515,3 +515,8 @@ gen_scatter(double)

 define_avgs()

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; reciprocals in double precision, if supported
+
+rsqrtd_decl()
+rcpd_decl()
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -4531,3 +4531,13 @@ define(`define_avgs', `
 define_up_avgs()
 define_down_avgs()
 ')
+
+define(`rsqrtd_decl', `
+declare  double @__rsqrt_uniform_double(double)
+declare <WIDTH x double> @__rsqrt_varying_double(<WIDTH x double>)
+')
+
+define(`rcpd_decl', `
+declare  double @__rcp_uniform_double(double)
+declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>)
+')
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -1086,10 +1086,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) {
    return 1.f / sqrtf(v);
 }

+static FORCEINLINE double __rsqrt_uniform_double(double v) {
+    return 1.0 / sqrt(v);
+}
+
 static FORCEINLINE float __rcp_uniform_float(float v) {
    return 1.f / v;
 }

+static FORCEINLINE double __rcp_uniform_double(double v) {
+    return 1.0 / v;
+}
+
 static FORCEINLINE float __sqrt_uniform_float(float v) {
    return sqrtf(v);
 }
@@ -1099,7 +1107,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) {
 }

 UNARY_OP(__vec16_f, __rcp_varying_float, __rcp_uniform_float)
+UNARY_OP(__vec16_d, __rcp_varying_double, __rcp_uniform_double)
 UNARY_OP(__vec16_f, __rsqrt_varying_float, __rsqrt_uniform_float)
+UNARY_OP(__vec16_d, __rsqrt_varying_double, __rsqrt_uniform_double)
 UNARY_OP(__vec16_f, __sqrt_varying_float, __sqrt_uniform_float)
 UNARY_OP(__vec16_d, __sqrt_varying_double, __sqrt_uniform_double)

--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -1138,10 +1138,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) {
    return 1.f / sqrtf(v);
 }

+static FORCEINLINE double __rsqrt_uniform_double(double v) {
+    return 1.0 / sqrt(v);
+}
+
 static FORCEINLINE float __rcp_uniform_float(float v) {
    return 1.f / v;
 }

+static FORCEINLINE double __rcp_uniform_double(double v) {
+    return 1.0 / v;
+}
+
 static FORCEINLINE float __sqrt_uniform_float(float v) {
    return sqrtf(v);
 }
@@ -1151,7 +1159,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) {
 }

 UNARY_OP(__vec32_f, __rcp_varying_float, __rcp_uniform_float)
+UNARY_OP(__vec32_d, __rcp_varying_double, __rcp_uniform_double)
 UNARY_OP(__vec32_f, __rsqrt_varying_float, __rsqrt_uniform_float)
+UNARY_OP(__vec32_d, __rsqrt_varying_double, __rsqrt_uniform_double)
 UNARY_OP(__vec32_f, __sqrt_varying_float, __sqrt_uniform_float)
 UNARY_OP(__vec32_d, __sqrt_varying_double, __sqrt_uniform_double)

--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -1271,10 +1271,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) {
    return 1.f / sqrtf(v);
 }

+static FORCEINLINE double __rsqrt_uniform_double(double v) {
+    return 1.0 / sqrt(v);
+}
+
 static FORCEINLINE float __rcp_uniform_float(float v) {
    return 1.f / v;
 }

+static FORCEINLINE double __rcp_uniform_double(double v) {
+    return 1.0 / v;
+}
+
 static FORCEINLINE float __sqrt_uniform_float(float v) {
    return sqrtf(v);
 }
@@ -1284,7 +1292,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) {
 }

 UNARY_OP(__vec64_f, __rcp_varying_float, __rcp_uniform_float)
+UNARY_OP(__vec64_d, __rcp_varying_double, __rcp_uniform_double)
 UNARY_OP(__vec64_f, __rsqrt_varying_float, __rsqrt_uniform_float)
+UNARY_OP(__vec64_d, __rsqrt_varying_double, __rsqrt_uniform_double)
 UNARY_OP(__vec64_f, __sqrt_varying_float, __sqrt_uniform_float)
 UNARY_OP(__vec64_d, __sqrt_varying_double, __sqrt_uniform_double)

--- a/examples/intrinsics/knc-i1x16.h
+++ b/examples/intrinsics/knc-i1x16.h
@@ -1811,6 +1811,20 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v)
  return _mm512_recip_ps(v);
 #endif
 }
+static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) {
+    __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000));
+    __vec16_d  exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex));
+    __vec16_f   xf = __cast_fptrunc(__vec16_f(), __mul(x, exp));
+    __vec16_f   yf = __rcp_varying_float(xf);
+    __vec16_d    y = __mul(__cast_fpext(__vec16_d(), yf), exp);
+    y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y))));
+    y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y))));
+    return y;
+}
+static FORCEINLINE double __rcp_uniform_double(double v) 
+{
+  return __extract_element(__rcp_varying_double(__smear_double<__vec16_d>(v)),0);
+}

 static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) 
 {
@@ -1820,6 +1834,23 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v)
  return _mm512_invsqrt_ps(v);
 #endif
 }
+static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) {
+    __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000));
+    __vec16_d  exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex));
+    __vec16_d exph = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x5fe0000000000000), __lshr(ex,1)));
+    __vec16_f   xf = __cast_fptrunc(__vec16_f(), __mul(x, exp));
+    __vec16_f   yf = __rsqrt_varying_float(xf);
+    __vec16_d    y = __mul(__cast_fpext(__vec16_d(), yf), exph);
+    __vec16_d   xh = __mul(x, __smear_double<__vec16_d>(0.5));
+    y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y)))));
+    y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y)))));
+    return y;
+}
+static FORCEINLINE double __rsqrt_uniform_double(double v) 
+{
+  return __extract_element(__rsqrt_varying_double(__smear_double<__vec16_d>(v)),0);
+}
+
 static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);}
 static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));}

--- a/examples/intrinsics/knc-i1x8.h
+++ b/examples/intrinsics/knc-i1x8.h
@@ -1836,6 +1836,20 @@ static FORCEINLINE __vec8_f __rcp_varying_float(__vec8_f v) {
    return _mm512_mask_recip_ps(FZERO, 0xFF, v);
 #endif
 }
+static FORCEINLINE __vec8_d __rcp_varying_double(__vec8_d x) {
+    __vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000));
+    __vec8_d  exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex));
+    __vec8_f   xf = __cast_fptrunc(__vec8_f(), __mul(x, exp));
+    __vec8_f   yf = __rcp_varying_float(xf);
+    __vec8_d    y = __mul(__cast_fpext(__vec8_d(), yf), exp);
+    y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(2.0), __mul(x, y))));
+    y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(2.0), __mul(x, y))));
+    return y;
+}
+static FORCEINLINE double __rcp_uniform_double(double v) 
+{
+  return __extract_element(__rcp_varying_double(__smear_double<__vec8_d>(v)),0);
+}

 static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) {
 #ifdef ISPC_FAST_MATH
@@ -1844,6 +1858,22 @@ static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) {
    return _mm512_mask_invsqrt_ps(FZERO,0xFF,v);
 #endif
 }
+static FORCEINLINE __vec8_d __rsqrt_varying_double(__vec8_d x) {
+    __vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000));
+    __vec8_d  exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex));
+    __vec8_d exph = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x5fe0000000000000), __lshr(ex,1)));
+    __vec8_f   xf = __cast_fptrunc(__vec8_f(), __mul(x, exp));
+    __vec8_f   yf = __rsqrt_varying_float(xf);
+    __vec8_d    y = __mul(__cast_fpext(__vec8_d(), yf), exph);
+    __vec8_d   xh = __mul(x, __smear_double<__vec8_d>(0.5));
+    y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y,y)))));
+    y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y,y)))));
+    return y;
+}
+static FORCEINLINE double __rsqrt_uniform_double(double v) 
+{
+  return __extract_element(__rsqrt_varying_double(__smear_double<__vec8_d>(v)),0);
+}
 static FORCEINLINE __vec8_f __sqrt_varying_float (__vec8_f v) {    return _mm512_mask_sqrt_ps(FZERO,0xFF,v);}
 #endif

@@ -2496,8 +2526,8 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val,
    _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
    return _mm_countbits_32(uint32_t(0xFF & mask));
 }
-static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val,
-                                                 __vec4_i1 mask) {
+static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec8_i32 val,
+                                                 __vec8_i1 mask) {
    return __packed_store_active(ptr, val, mask);
 }
 static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val,
@@ -2508,8 +2538,8 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val,
                                                 __vec8_i1 mask) {
    return __packed_store_active((uint32_t *)p, val, mask);
 }
-static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val,
-                                                 __vec4_i1 mask) {
+static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec8_i32 val,
+                                                 __vec8_i1 mask) {
    return __packed_store_active(ptr, val, mask);
 }

--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -1472,6 +1472,17 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) {
    return _mm512_recip_ps(v);
 #endif
 }
+static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) {
+  __vec16_d y;
+  for (int i = 0; i < 16; i++)
+    __insert_element(&y, i, 1.0/__extract_element(x,i));
+  return y;
+}
+static FORCEINLINE double __rcp_uniform_double(double v) 
+{
+  return 1.0/v;
+}
+

 static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
 #ifdef ISPC_FAST_MATH
@@ -1480,6 +1491,17 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
    return _mm512_invsqrt_ps(v);
 #endif
 }
+static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) {
+  __vec16_d y;
+  for (int i = 0; i < 16; i++)
+    __insert_element(&y, i, 1.0/sqrt(__extract_element(x,i)));
+  return y;
+}
+static FORCEINLINE double __rsqrt_uniform_double(double v) 
+{
+  return 1.0/v;
+}
+

 static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) {
  return _mm512_exp_ps(v);
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -2420,6 +2420,21 @@ static FORCEINLINE __vec4_f __rcp_varying_float(__vec4_f v) {
    return r;
 }

+static FORCEINLINE __vec4_d __rcp_varying_double(__vec4_d x) {
+    __vec4_i64 ex = __and(__cast_bits(__vec4_i64(), x), __smear_i64<__vec4_i64>(0x7fe0000000000000));
+    __vec4_d  exp = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x7fd0000000000000), ex));
+    __vec4_f   xf = __cast_fptrunc(__vec4_f(), __mul(x, exp));
+    __vec4_f   yf = __rcp_varying_float(xf);
+    __vec4_d    y = __mul(__cast_fpext(__vec4_d(), yf), exp);
+    y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(2.0), __mul(x, y))));
+    y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(2.0), __mul(x, y))));
+    return y;
+}
+static FORCEINLINE double __rcp_uniform_double(double v) 
+{
+  return __extract_element(__rcp_varying_double(__smear_double<__vec4_d>(v)),0);
+}
+
 static FORCEINLINE __vec4_f __rsqrt_varying_float(__vec4_f v) {
    __m128 rsqrt = _mm_rsqrt_ps(v.v);
    // Newton-Raphson iteration to improve precision
@@ -2431,6 +2446,22 @@ static FORCEINLINE __vec4_f __rsqrt_varying_float(__vec4_f v) {
    __m128 half_scale = _mm_mul_ps(_mm_set1_ps(0.5), rs_mul);
    return half_scale;
 }
+static FORCEINLINE __vec4_d __rsqrt_varying_double(__vec4_d x) {
+    __vec4_i64 ex = __and(__cast_bits(__vec4_i64(), x), __smear_i64<__vec4_i64>(0x7fe0000000000000));
+    __vec4_d  exp = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x7fd0000000000000), ex));
+    __vec4_d exph = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x5fe0000000000000), __lshr(ex,1)));
+    __vec4_f   xf = __cast_fptrunc(__vec4_f(), __mul(x, exp));
+    __vec4_f   yf = __rsqrt_varying_float(xf);
+    __vec4_d    y = __mul(__cast_fpext(__vec4_d(), yf), exph);
+    __vec4_d   xh = __mul(x, __smear_double<__vec4_d>(0.5));
+    y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(0.5), __mul(xh, __mul(y,y)))));
+    y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(0.5), __mul(xh, __mul(y,y)))));
+    return y;
+}
+static FORCEINLINE double __rsqrt_uniform_double(double v) 
+{
+  return __extract_element(__rsqrt_varying_double(__smear_double<__vec4_d>(v)),0);
+}

 static FORCEINLINE __vec4_f __sqrt_varying_float(__vec4_f v) {
    return _mm_sqrt_ps(v.v);
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -201,7 +201,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
    m_hasRand(false),
    m_hasGather(false),
    m_hasScatter(false),
-    m_hasTranscendentals(false)
+    m_hasTranscendentals(false),
+    m_hasRsqrtd(false),
+    m_hasRcpd(false)
 {
    if (isa == NULL) {
        if (cpu != NULL) {
@@ -419,6 +421,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_hasHalf = true;
        this->m_hasTranscendentals = true;
        this->m_hasGather = this->m_hasScatter = true;
+        this->m_hasRsqrtd = this->m_hasRcpd = true;
    }
    else if (!strcasecmp(isa, "generic-8") ||
             !strcasecmp(isa, "generic-x8")) {
@@ -431,6 +434,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_hasHalf = true;
        this->m_hasTranscendentals = true;
        this->m_hasGather = this->m_hasScatter = true;
+        this->m_hasRsqrtd = this->m_hasRcpd = true;
    }
    else if (!strcasecmp(isa, "generic-16") ||
             !strcasecmp(isa, "generic-x16")) {
@@ -443,6 +447,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_hasHalf = true;
        this->m_hasTranscendentals = true;
        this->m_hasGather = this->m_hasScatter = true;
+        this->m_hasRsqrtd = this->m_hasRcpd = true;
    }
    else if (!strcasecmp(isa, "generic-32") ||
             !strcasecmp(isa, "generic-x32")) {
@@ -455,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_hasHalf = true;
        this->m_hasTranscendentals = true;
        this->m_hasGather = this->m_hasScatter = true;
+        this->m_hasRsqrtd = this->m_hasRcpd = true;
    }
    else if (!strcasecmp(isa, "generic-64") ||
             !strcasecmp(isa, "generic-x64")) {
@@ -467,6 +473,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_hasHalf = true;
        this->m_hasTranscendentals = true;
        this->m_hasGather = this->m_hasScatter = true;
+        this->m_hasRsqrtd = this->m_hasRcpd = true;
    }
    else if (!strcasecmp(isa, "generic-1") ||
             !strcasecmp(isa, "generic-x1")) {
--- a/ispc.h
+++ b/ispc.h
@@ -281,6 +281,10 @@ public:
    bool hasScatter() const {return m_hasScatter;}

    bool hasTranscendentals() const {return m_hasTranscendentals;}
+    
+    bool hasRsqrtd() const {return m_hasRsqrtd;}
+    
+    bool hasRcpd() const {return m_hasRcpd;}

 private:

@@ -380,6 +384,12 @@ private:
    /** Indicates whether the target has support for transcendentals (beyond
        sqrt, which we assume that all of them handle). */
    bool m_hasTranscendentals;
+    
+    /** Indicates whether there is an ISA double precision rsqrt. */
+    bool m_hasRsqrtd;
+    
+    /** Indicates whether there is an ISA double precision rcp. */
+    bool m_hasRcpd;
 };


--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -1391,6 +1391,43 @@ static inline uniform float rcp(uniform float v) {
    return __rcp_uniform_float(v);
 }

+#define RCPD(QUAL) \
+__declspec(safe)  \
+static inline QUAL double __rcp_iterate_##QUAL##_double(QUAL double v, QUAL double iv) \
+{ \
+  iv = iv * (2.0d - v*iv); \
+  iv = iv * (2.0d - v*iv); \
+  return iv; \
+} \
+__declspec(safe)  \
+static inline QUAL double __rcp_safe_##QUAL##_double(QUAL double x) \
+{ \
+  if (x <= 1.0d+33 && x >= 1.0d-33) \
+    return __rcp_iterate_##QUAL##_double(x, rcp((QUAL float)x)); \
+  QUAL int64  ex   = intbits(x) & 0x7fe0000000000000; \
+  QUAL double exp  = doublebits(  0x7fd0000000000000 + ~ex      );   \
+  QUAL double   y  = rcp((QUAL float)(x*exp)); \
+  return __rcp_iterate_##QUAL##_double(x, y*exp); \
+} 
+
+RCPD(varying)
+__declspec(safe)   
+static inline double rcp(double v) {   
+  if (__have_native_rcpd)
+    return __rcp_varying_double(v);
+  else
+    return __rcp_safe_varying_double(v);
+}   
+
+RCPD(uniform)
+__declspec(safe)
+static inline uniform double rcp(uniform double v) {
+  if (__have_native_rcpd)
+    return __rcp_uniform_double(v);
+  else
+    return __rcp_safe_uniform_double(v);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // min/max

@@ -3517,6 +3554,44 @@ static inline uniform double sqrt(uniform double v) {
    return __sqrt_uniform_double(v);
 }

+#define RSQRTD(QUAL)  \
+__declspec(safe)    \
+static inline QUAL double __rsqrt_iterate_##QUAL##_double(QUAL double x, QUAL double y)   \
+{   \
+  QUAL double xh = x*0.5d;    \
+  y += y*(0.5d0 - xh*y*y);    \
+  y += y*(0.5d0 - xh*y*y);    \
+  return y;   \
+}   \
+__declspec(safe)    \
+static inline QUAL double __rsqrt_safe_##QUAL##_double (QUAL double x)    \
+{   \
+  if (x <= 1.0d+33 && x >= 1.0d-33)   \
+    return __rsqrt_iterate_##QUAL##_double(x, rsqrt((QUAL float)x));   \
+  QUAL int64  ex   = intbits(x) & 0x7fe0000000000000;                 \
+  QUAL double exp  = doublebits(  0x7fd0000000000000 -  ex      );   /* 1.0d/exponent  */   \
+  QUAL double exph = doublebits(  0x5fe0000000000000 - (ex >> 1));   /* 1.0d/sqrt(exponent) */    \
+  QUAL double   y  = rsqrt((QUAL float)(x*exp));          \
+  return __rsqrt_iterate_##QUAL##_double(x, y*exph);    \
+}
+
+RSQRTD(varying)
+__declspec(safe)   
+static inline double rsqrt(double v) {   
+  if (__have_native_rsqrtd)
+    return __rsqrt_varying_double(v);
+  else
+    return __rsqrt_safe_varying_double(v);
+}   
+
+RSQRTD(uniform)
+__declspec(safe)
+static inline uniform double rsqrt(uniform double v) {
+  if (__have_native_rsqrtd)
+    return __rsqrt_uniform_double(v);
+  else
+    return __rsqrt_safe_uniform_double(v);
+}
 __declspec(safe)
 static inline double ldexp(double x, int n) {
    unsigned int64 ex = 0x7ff0000000000000;
--- a/tests/test-147.ispc
+++ b/tests/test-147.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double x = aFOO[programIndex&0x3]*1d100;
+    double d, ix; 
+    ix = rcp(x);
+    d = (ix - 1.0d0 / x);
+    d = (d < 0.0d0) ? -d : d;
+    RET[programIndex] = (d < 1d-15 && !isnan(d)) ? 1.0d0 : 0.0d0; 
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1.0d0;
+}
--- a/tests/test-148.ispc
+++ b/tests/test-148.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double x = aFOO[programIndex]*1d100;
+    double d, invsqrt = rsqrt(x);
+    d = (x * (invsqrt * invsqrt)) - 1.0d0;
+    if (d < 0.0d0) d = -d;
+    RET[programIndex] = d > 1d-15;
+}
+
+
+export void result(uniform float RET[]) {
+       RET[programIndex] = 0;
+}