diff --git a/builtins.cpp b/builtins.cpp index d6eefde5..fee322e7 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -497,6 +497,8 @@ lSetInternalFunctions(llvm::Module *module) { "__prefetch_read_uniform_nt", "__rcp_uniform_float", "__rcp_varying_float", + "__rcp_uniform_double", + "__rcp_varying_double", "__rdrand_i16", "__rdrand_i32", "__rdrand_i64", @@ -534,6 +536,8 @@ lSetInternalFunctions(llvm::Module *module) { "__round_varying_float", "__rsqrt_uniform_float", "__rsqrt_varying_float", + "__rsqrt_uniform_double", + "__rsqrt_varying_double", "__set_system_isa", "__sext_uniform_bool", "__sext_varying_bool", @@ -1146,6 +1150,10 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__have_native_transcendentals", g->target->hasTranscendentals(), module, symbolTable); + lDefineConstantInt("__have_native_rsqrtd", g->target->hasRsqrtd(), + module, symbolTable); + lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(), + module, symbolTable); if (g->forceAlignment != -1) { llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true); diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index f8fd5cd5..b3a77871 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -687,3 +687,10 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1) ret <16 x double> %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrtd_decl() +rcpd_decl() + diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e98a3843..9738f9d3 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -559,3 +559,8 @@ gen_scatter(float) gen_scatter(i64) gen_scatter(double) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if 
supported + +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index e1832030..a6601a28 100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -511,3 +511,5 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %call } +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index c43a12a7..3dcd8373 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -992,3 +992,8 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read define_avgs() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 2b2b21c9..401c862d 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -191,9 +191,14 @@ declare <WIDTH x double> @__max_varying_double(<WIDTH x double>, declare float @__rsqrt_uniform_float(float) nounwind readnone declare float @__rcp_uniform_float(float) nounwind readnone +declare double @__rsqrt_uniform_double(double) nounwind readnone +declare double @__rcp_uniform_double(double) nounwind readnone declare float @__sqrt_uniform_float(float) nounwind readnone declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone +declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>) nounwind readnone +declare <WIDTH x double> @__rsqrt_varying_double(<WIDTH x double>) nounwind readnone + declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone declare double @__sqrt_uniform_double(double) nounwind readnone diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll index a0575927..8e0ef121 100644 --- a/builtins/target-neon-16.ll +++ b/builtins/target-neon-16.ll @@ -515,3 +515,9 @@ define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone { %r = call <8 x i16>
@llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1) ret <8 x i16> %r } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-neon-32.ll b/builtins/target-neon-32.ll index 30b062c9..d6e861a2 100644 --- a/builtins/target-neon-32.ll +++ b/builtins/target-neon-32.ll @@ -485,3 +485,9 @@ define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone { %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1) ret <4 x i16> %r } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll index 2accfe53..aaa0a7b7 100644 --- a/builtins/target-neon-8.ll +++ b/builtins/target-neon-8.ll @@ -581,3 +581,9 @@ define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone { v8tov16(i16, %r0, %r1, %r) ret <16 x i16> %r } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 77bf1a9d..bfb927e5 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -652,3 +652,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) ret <8 x double> %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index e42d4990..93a8eb93 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -587,3 +587,9 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 72b81ff0..0de5c1b4 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -488,3 +488,9 @@ define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) { define_avg_up_int8() define_avg_up_int16() define_down_avgs() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 69b355e3..79f44212 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -490,3 +490,9 @@ define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { define_avg_up_int8() define_avg_up_int16() define_down_avgs() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 842db53f..ceff27f0 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -592,3 +592,8 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r define_avgs() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 16177b47..9e2ac8a5 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -515,3 +515,8 @@ gen_scatter(double) define_avgs() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/util.m4 b/builtins/util.m4 index f9ae7cd1..fbd929a1 100644 --- 
a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4531,3 +4531,13 @@ define(`define_avgs', ` define_up_avgs() define_down_avgs() ') + +define(`rsqrtd_decl', ` +declare double @__rsqrt_uniform_double(double) +declare <WIDTH x double> @__rsqrt_varying_double(<WIDTH x double>) +') + +define(`rcpd_decl', ` +declare double @__rcp_uniform_double(double) +declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>) +') diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 0aa8a3f6..3b5c6ec3 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1086,10 +1086,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); } +static FORCEINLINE double __rsqrt_uniform_double(double v) { + return 1.0 / sqrt(v); +} + static FORCEINLINE float __rcp_uniform_float(float v) { return 1.f / v; } +static FORCEINLINE double __rcp_uniform_double(double v) { + return 1.0 / v; +} + static FORCEINLINE float __sqrt_uniform_float(float v) { return sqrtf(v); } @@ -1099,7 +1107,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) { } UNARY_OP(__vec16_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec16_d, __rcp_varying_double, __rcp_uniform_double) UNARY_OP(__vec16_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec16_d, __rsqrt_varying_double, __rsqrt_uniform_double) UNARY_OP(__vec16_f, __sqrt_varying_float, __sqrt_uniform_float) UNARY_OP(__vec16_d, __sqrt_varying_double, __sqrt_uniform_double) diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 924b049d..f5bb233c 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1138,10 +1138,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); } +static FORCEINLINE double __rsqrt_uniform_double(double v) { + return 1.0 / sqrt(v); +} + static FORCEINLINE float __rcp_uniform_float(float v) { return 1.f / v; } +static FORCEINLINE double __rcp_uniform_double(double v) { + return 1.0 / v; +} + 
static FORCEINLINE float __sqrt_uniform_float(float v) { return sqrtf(v); } @@ -1151,7 +1159,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) { } UNARY_OP(__vec32_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec32_d, __rcp_varying_double, __rcp_uniform_double) UNARY_OP(__vec32_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec32_d, __rsqrt_varying_double, __rsqrt_uniform_double) UNARY_OP(__vec32_f, __sqrt_varying_float, __sqrt_uniform_float) UNARY_OP(__vec32_d, __sqrt_varying_double, __sqrt_uniform_double) diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index b1451c96..a7148c8b 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1271,10 +1271,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); } +static FORCEINLINE double __rsqrt_uniform_double(double v) { + return 1.0 / sqrt(v); +} + static FORCEINLINE float __rcp_uniform_float(float v) { return 1.f / v; } +static FORCEINLINE double __rcp_uniform_double(double v) { + return 1.0 / v; +} + static FORCEINLINE float __sqrt_uniform_float(float v) { return sqrtf(v); } @@ -1284,7 +1292,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) { } UNARY_OP(__vec64_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec64_d, __rcp_varying_double, __rcp_uniform_double) UNARY_OP(__vec64_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec64_d, __rsqrt_varying_double, __rsqrt_uniform_double) UNARY_OP(__vec64_f, __sqrt_varying_float, __sqrt_uniform_float) UNARY_OP(__vec64_d, __sqrt_varying_double, __sqrt_uniform_double) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 141c47bb..ba6ef005 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1811,6 +1811,20 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) return _mm512_recip_ps(v); #endif } +static FORCEINLINE __vec16_d 
__rcp_varying_double(__vec16_d x) { + __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); + __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); + __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); + __vec16_f yf = __rcp_varying_float(xf); + __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exp); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); + return y; +} +static FORCEINLINE double __rcp_uniform_double(double v) +{ + return __extract_element(__rcp_varying_double(__smear_double<__vec16_d>(v)),0); +} static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { @@ -1820,6 +1834,23 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) return _mm512_invsqrt_ps(v); #endif } +static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) { + __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); + __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); + __vec16_d exph = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x5fe0000000000000), __lshr(ex,1))); + __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); + __vec16_f yf = __rsqrt_varying_float(xf); + __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exph); + __vec16_d xh = __mul(x, __smear_double<__vec16_d>(0.5)); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); + return y; +} +static FORCEINLINE double __rsqrt_uniform_double(double v) +{ + return __extract_element(__rsqrt_varying_double(__smear_double<__vec16_d>(v)),0); +} + static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} static FORCEINLINE __vec16_d 
__sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index 32f39c4a..478ad75a 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -1836,6 +1836,20 @@ static FORCEINLINE __vec8_f __rcp_varying_float(__vec8_f v) { return _mm512_mask_recip_ps(FZERO, 0xFF, v); #endif } +static FORCEINLINE __vec8_d __rcp_varying_double(__vec8_d x) { + __vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000)); + __vec8_d exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex)); + __vec8_f xf = __cast_fptrunc(__vec8_f(), __mul(x, exp)); + __vec8_f yf = __rcp_varying_float(xf); + __vec8_d y = __mul(__cast_fpext(__vec8_d(), yf), exp); + y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(2.0), __mul(x, y)))); + y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(2.0), __mul(x, y)))); + return y; +} +static FORCEINLINE double __rcp_uniform_double(double v) +{ + return __extract_element(__rcp_varying_double(__smear_double<__vec8_d>(v)),0); +} static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) { #ifdef ISPC_FAST_MATH @@ -1844,6 +1858,22 @@ static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) { return _mm512_mask_invsqrt_ps(FZERO,0xFF,v); #endif } +static FORCEINLINE __vec8_d __rsqrt_varying_double(__vec8_d x) { + __vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000)); + __vec8_d exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex)); + __vec8_d exph = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x5fe0000000000000), __lshr(ex,1))); + __vec8_f xf = __cast_fptrunc(__vec8_f(), __mul(x, exp)); + __vec8_f yf = __rsqrt_varying_float(xf); + __vec8_d y = __mul(__cast_fpext(__vec8_d(), yf), exph); + __vec8_d xh = __mul(x, __smear_double<__vec8_d>(0.5)); + y = __add(y, __mul(y, 
__sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y,y))))); + y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y,y))))); + return y; +} +static FORCEINLINE double __rsqrt_uniform_double(double v) +{ + return __extract_element(__rsqrt_varying_double(__smear_double<__vec8_d>(v)),0); +} static FORCEINLINE __vec8_f __sqrt_varying_float (__vec8_f v) { return _mm512_mask_sqrt_ps(FZERO,0xFF,v);} #endif @@ -2496,8 +2526,8 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val, _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); return _mm_countbits_32(uint32_t(0xFF & mask)); } -static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val, - __vec4_i1 mask) { +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec8_i32 val, + __vec8_i1 mask) { return __packed_store_active(ptr, val, mask); } static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val, @@ -2508,8 +2538,8 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val, __vec8_i1 mask) { return __packed_store_active((uint32_t *)p, val, mask); } -static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val, - __vec4_i1 mask) { +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec8_i32 val, + __vec8_i1 mask) { return __packed_store_active(ptr, val, mask); } diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 0077ad88..458da458 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1472,6 +1472,17 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) { return _mm512_recip_ps(v); #endif } +static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) { + __vec16_d y; + for (int i = 0; i < 16; i++) + __insert_element(&y, i, 1.0/__extract_element(x,i)); + return y; +} +static FORCEINLINE double __rcp_uniform_double(double v) +{ + return 1.0/v; +} + static 
FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { #ifdef ISPC_FAST_MATH @@ -1480,6 +1491,17 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { return _mm512_invsqrt_ps(v); #endif } +static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) { + __vec16_d y; + for (int i = 0; i < 16; i++) + __insert_element(&y, i, 1.0/sqrt(__extract_element(x,i))); + return y; +} +static FORCEINLINE double __rsqrt_uniform_double(double v) +{ + return 1.0/sqrt(v); +} + static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) { return _mm512_exp_ps(v); diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 5dd424d9..45b31be1 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2420,6 +2420,21 @@ static FORCEINLINE __vec4_f __rcp_varying_float(__vec4_f v) { return r; } +static FORCEINLINE __vec4_d __rcp_varying_double(__vec4_d x) { + __vec4_i64 ex = __and(__cast_bits(__vec4_i64(), x), __smear_i64<__vec4_i64>(0x7fe0000000000000)); + __vec4_d exp = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x7fd0000000000000), ex)); + __vec4_f xf = __cast_fptrunc(__vec4_f(), __mul(x, exp)); + __vec4_f yf = __rcp_varying_float(xf); + __vec4_d y = __mul(__cast_fpext(__vec4_d(), yf), exp); + y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(2.0), __mul(x, y)))); + y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(2.0), __mul(x, y)))); + return y; +} +static FORCEINLINE double __rcp_uniform_double(double v) +{ + return __extract_element(__rcp_varying_double(__smear_double<__vec4_d>(v)),0); +} + static FORCEINLINE __vec4_f __rsqrt_varying_float(__vec4_f v) { __m128 rsqrt = _mm_rsqrt_ps(v.v); // Newton-Raphson iteration to improve precision @@ -2431,6 +2446,22 @@ static FORCEINLINE __vec4_f __rsqrt_varying_float(__vec4_f v) { __m128 half_scale = _mm_mul_ps(_mm_set1_ps(0.5), rs_mul); return half_scale; } +static FORCEINLINE __vec4_d __rsqrt_varying_double(__vec4_d x) { + __vec4_i64 ex = __and(__cast_bits(__vec4_i64(), 
x), __smear_i64<__vec4_i64>(0x7fe0000000000000)); + __vec4_d exp = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x7fd0000000000000), ex)); + __vec4_d exph = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x5fe0000000000000), __lshr(ex,1))); + __vec4_f xf = __cast_fptrunc(__vec4_f(), __mul(x, exp)); + __vec4_f yf = __rsqrt_varying_float(xf); + __vec4_d y = __mul(__cast_fpext(__vec4_d(), yf), exph); + __vec4_d xh = __mul(x, __smear_double<__vec4_d>(0.5)); + y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(0.5), __mul(xh, __mul(y,y))))); + y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(0.5), __mul(xh, __mul(y,y))))); + return y; +} +static FORCEINLINE double __rsqrt_uniform_double(double v) +{ + return __extract_element(__rsqrt_varying_double(__smear_double<__vec4_d>(v)),0); +} static FORCEINLINE __vec4_f __sqrt_varying_float(__vec4_f v) { return _mm_sqrt_ps(v.v); diff --git a/ispc.cpp b/ispc.cpp index ed326b14..1386d65e 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -201,7 +201,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_hasRand(false), m_hasGather(false), m_hasScatter(false), - m_hasTranscendentals(false) + m_hasTranscendentals(false), + m_hasRsqrtd(false), + m_hasRcpd(false) { if (isa == NULL) { if (cpu != NULL) { @@ -419,6 +421,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-8") || !strcasecmp(isa, "generic-x8")) { @@ -431,6 +434,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-16") || !strcasecmp(isa, "generic-x16")) { @@ -443,6 +447,7 @@ Target::Target(const 
char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-32") || !strcasecmp(isa, "generic-x32")) { @@ -455,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-64") || !strcasecmp(isa, "generic-x64")) { @@ -467,6 +473,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-1") || !strcasecmp(isa, "generic-x1")) { diff --git a/ispc.h b/ispc.h index 88eb8353..4b6df8c3 100644 --- a/ispc.h +++ b/ispc.h @@ -281,6 +281,10 @@ public: bool hasScatter() const {return m_hasScatter;} bool hasTranscendentals() const {return m_hasTranscendentals;} + + bool hasRsqrtd() const {return m_hasRsqrtd;} + + bool hasRcpd() const {return m_hasRcpd;} private: @@ -380,6 +384,12 @@ private: /** Indicates whether the target has support for transcendentals (beyond sqrt, which we assume that all of them handle). */ bool m_hasTranscendentals; + + /** Indicates whether there is an ISA double precision rsqrt. */ + bool m_hasRsqrtd; + + /** Indicates whether there is an ISA double precision rcp. 
*/ + bool m_hasRcpd; }; diff --git a/stdlib.ispc b/stdlib.ispc index 3b17283d..24217cd0 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1391,6 +1391,43 @@ static inline uniform float rcp(uniform float v) { return __rcp_uniform_float(v); } +#define RCPD(QUAL) \ +__declspec(safe) \ +static inline QUAL double __rcp_iterate_##QUAL##_double(QUAL double v, QUAL double iv) \ +{ \ + iv = iv * (2.0d - v*iv); \ + iv = iv * (2.0d - v*iv); \ + return iv; \ +} \ +__declspec(safe) \ +static inline QUAL double __rcp_safe_##QUAL##_double(QUAL double x) \ +{ \ + if (x <= 1.0d+33 && x >= 1.0d-33) \ + return __rcp_iterate_##QUAL##_double(x, rcp((QUAL float)x)); \ + QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \ + QUAL double exp = doublebits( 0x7fd0000000000000 + ~ex ); \ + QUAL double y = rcp((QUAL float)(x*exp)); \ + return __rcp_iterate_##QUAL##_double(x, y*exp); \ +} + +RCPD(varying) +__declspec(safe) +static inline double rcp(double v) { + if (__have_native_rcpd) + return __rcp_varying_double(v); + else + return __rcp_safe_varying_double(v); +} + +RCPD(uniform) +__declspec(safe) +static inline uniform double rcp(uniform double v) { + if (__have_native_rcpd) + return __rcp_uniform_double(v); + else + return __rcp_safe_uniform_double(v); +} + /////////////////////////////////////////////////////////////////////////// // min/max @@ -3517,6 +3554,44 @@ static inline uniform double sqrt(uniform double v) { return __sqrt_uniform_double(v); } +#define RSQRTD(QUAL) \ +__declspec(safe) \ +static inline QUAL double __rsqrt_iterate_##QUAL##_double(QUAL double x, QUAL double y) \ +{ \ + QUAL double xh = x*0.5d; \ + y += y*(0.5d0 - xh*y*y); \ + y += y*(0.5d0 - xh*y*y); \ + return y; \ +} \ +__declspec(safe) \ +static inline QUAL double __rsqrt_safe_##QUAL##_double (QUAL double x) \ +{ \ + if (x <= 1.0d+33 && x >= 1.0d-33) \ + return __rsqrt_iterate_##QUAL##_double(x, rsqrt((QUAL float)x)); \ + QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \ + QUAL double exp = doublebits( 
0x7fd0000000000000 - ex ); /* 1.0d/exponent */ \ + QUAL double exph = doublebits( 0x5fe0000000000000 - (ex >> 1)); /* 1.0d/sqrt(exponent) */ \ + QUAL double y = rsqrt((QUAL float)(x*exp)); \ + return __rsqrt_iterate_##QUAL##_double(x, y*exph); \ +} + +RSQRTD(varying) +__declspec(safe) +static inline double rsqrt(double v) { + if (__have_native_rsqrtd) + return __rsqrt_varying_double(v); + else + return __rsqrt_safe_varying_double(v); +} + +RSQRTD(uniform) +__declspec(safe) +static inline uniform double rsqrt(uniform double v) { + if (__have_native_rsqrtd) + return __rsqrt_uniform_double(v); + else + return __rsqrt_safe_uniform_double(v); +} __declspec(safe) static inline double ldexp(double x, int n) { unsigned int64 ex = 0x7ff0000000000000; diff --git a/tests/test-147.ispc b/tests/test-147.ispc new file mode 100644 index 00000000..5c5d4bc1 --- /dev/null +++ b/tests/test-147.ispc @@ -0,0 +1,16 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + double x = aFOO[programIndex&0x3]*1d100; + double d, ix; + ix = rcp(x); + d = (ix - 1.0d0 / x); + d = (d < 0.0d0) ? -d : d; + RET[programIndex] = (d < 1d-15 && !isnan(d)) ? 1.0d0 : 0.0d0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1.0d0; +} diff --git a/tests/test-148.ispc b/tests/test-148.ispc new file mode 100644 index 00000000..1c01428c --- /dev/null +++ b/tests/test-148.ispc @@ -0,0 +1,16 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + double x = aFOO[programIndex]*1d100; + double d, invsqrt = rsqrt(x); + d = (x * (invsqrt * invsqrt)) - 1.0d0; + if (d < 0.0d0) d = -d; + RET[programIndex] = d > 1d-15; +} + + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +}