From eb01ffd4e6b11e0e3679fe59d316cfb629cb283d Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Wed, 5 Feb 2014 13:43:07 +0100
Subject: [PATCH] first commit for {rsqrt,rcp}d knc support. going to test on
 other node now

---
 examples/intrinsics/knc-i1x16.h | 31 +++++++++++++++++++++++++++++++
 examples/intrinsics/knc-i1x8.h  | 30 ++++++++++++++++++++++++++++++
 examples/intrinsics/knc.h       | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+)

diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h
index 141c47bb..ba6ef005 100644
--- a/examples/intrinsics/knc-i1x16.h
+++ b/examples/intrinsics/knc-i1x16.h
@@ -1811,6 +1811,20 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v)
     return _mm512_recip_ps(v);
 #endif
 }
+static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) { /* 1/x: float-precision seed + 2 Newton steps */
+    __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); /* exponent bits */
+    __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); /* ~2^-e scale */
+    __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); /* scaled x fits float range */
+    __vec16_f yf = __rcp_varying_float(xf);
+    __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exp); /* undo scaling: y ~= 1/x */
+    y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(1.0), __mul(x, y)))); /* NR: y += y*(1 - x*y); 2.0 diverges */
+    y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(1.0), __mul(x, y))));
+    return y;
+}
+static FORCEINLINE double __rcp_uniform_double(double v)
+{
+    return __extract_element(__rcp_varying_double(__smear_double<__vec16_d>(v)),0);
+}
 
 static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v)
 {
@@ -1820,6 +1834,23 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v)
     return _mm512_invsqrt_ps(v);
 #endif
 }
+static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) { /* 1/sqrt(x): float seed + 2 Newton steps */
+    __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); /* exponent bits */
+    __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); /* ~2^-e */
+    __vec16_d exph = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x5fe0000000000000), __lshr(ex, 1))); /* ~2^(-e/2) */
+    __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp));
+    __vec16_f yf = __rsqrt_varying_float(xf);
+    __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exph);
+    __vec16_d xh = __mul(x, __smear_double<__vec16_d>(0.5));
+    y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y, y))))); /* NR: y += y*(0.5 - 0.5*x*y^2) */
+    y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y, y)))));
+    return y;
+}
+static FORCEINLINE double __rsqrt_uniform_double(double v)
+{
+    return __extract_element(__rsqrt_varying_double(__smear_double<__vec16_d>(v)),0);
+}
+
 static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);}
 static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));}
 
diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h
index 32f39c4a..4a5b28a9 100644
--- a/examples/intrinsics/knc-i1x8.h
+++ b/examples/intrinsics/knc-i1x8.h
@@ -1836,6 +1836,20 @@ static FORCEINLINE __vec8_f __rcp_varying_float(__vec8_f v) {
     return _mm512_mask_recip_ps(FZERO, 0xFF, v);
 #endif
 }
+static FORCEINLINE __vec8_d __rcp_varying_double(__vec8_d x) { /* 1/x: float-precision seed + 2 Newton steps */
+    __vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000)); /* exponent bits */
+    __vec8_d exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex)); /* ~2^-e scale */
+    __vec8_f xf = __cast_fptrunc(__vec8_f(), __mul(x, exp));
+    __vec8_f yf = __rcp_varying_float(xf);
+    __vec8_d y = __mul(__cast_fpext(__vec8_d(), yf), exp);
+    y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(1.0), __mul(x, y)))); /* NR: y += y*(1 - x*y); 2.0 diverges */
+    y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(1.0), __mul(x, y))));
+    return y;
+}
+static FORCEINLINE double __rcp_uniform_double(double v)
+{
+    return __extract_element(__rcp_varying_double(__smear_double<__vec8_d>(v)),0);
+}
 
 static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) {
 #ifdef ISPC_FAST_MATH
@@ -1844,6 +1858,22 @@ static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) {
     return _mm512_mask_invsqrt_ps(FZERO,0xFF,v);
 #endif
 }
+static FORCEINLINE __vec8_d __rsqrt_varying_double(__vec8_d x) { /* 1/sqrt(x): float seed + 2 Newton steps */
+    __vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000)); /* exponent bits */
+    __vec8_d exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex)); /* ~2^-e */
+    __vec8_d exph = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x5fe0000000000000), __lshr(ex, 1))); /* ~2^(-e/2) */
+    __vec8_f xf = __cast_fptrunc(__vec8_f(), __mul(x, exp));
+    __vec8_f yf = __rsqrt_varying_float(xf);
+    __vec8_d y = __mul(__cast_fpext(__vec8_d(), yf), exph);
+    __vec8_d xh = __mul(x, __smear_double<__vec8_d>(0.5));
+    y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y, y))))); /* NR: y += y*(0.5 - 0.5*x*y^2) */
+    y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y, y)))));
+    return y;
+}
+static FORCEINLINE double __rsqrt_uniform_double(double v)
+{
+    return __extract_element(__rsqrt_varying_double(__smear_double<__vec8_d>(v)),0);
+}
 static FORCEINLINE __vec8_f __sqrt_varying_float (__vec8_f v) { return _mm512_mask_sqrt_ps(FZERO,0xFF,v);}
 #endif
 
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 0077ad88..72951845 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -1472,6 +1472,21 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) {
     return _mm512_recip_ps(v);
 #endif
 }
+static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) { /* 1/x: float-precision seed + 2 Newton steps */
+    __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); /* exponent bits */
+    __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); /* ~2^-e scale */
+    __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp));
+    __vec16_f yf = __rcp_varying_float(xf);
+    __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exp);
+    y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(1.0), __mul(x, y)))); /* NR: y += y*(1 - x*y); 2.0 diverges */
+    y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(1.0), __mul(x, y))));
+    return y;
+}
+static FORCEINLINE double __rcp_uniform_double(double v)
+{
+    return __extract_element(__rcp_varying_double(__smear_double<__vec16_d>(v)),0);
+}
+
 static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
 #ifdef ISPC_FAST_MATH
@@ -1480,6 +1495,23 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
     return _mm512_invsqrt_ps(v);
 #endif
 }
+static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) { /* 1/sqrt(x): float seed + 2 Newton steps */
+    __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); /* exponent bits */
+    __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); /* ~2^-e */
+    __vec16_d exph = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x5fe0000000000000), __lshr(ex, 1))); /* ~2^(-e/2) */
+    __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp));
+    __vec16_f yf = __rsqrt_varying_float(xf);
+    __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exph);
+    __vec16_d xh = __mul(x, __smear_double<__vec16_d>(0.5));
+    y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y, y))))); /* NR: y += y*(0.5 - 0.5*x*y^2) */
+    y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y, y)))));
+    return y;
+}
+static FORCEINLINE double __rsqrt_uniform_double(double v)
+{
+    return __extract_element(__rsqrt_varying_double(__smear_double<__vec16_d>(v)),0);
+}
+
 static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) {
     return _mm512_exp_ps(v);