From 3a72e05c3e8cfacc797a281424cb0b1e95668904 Mon Sep 17 00:00:00 2001 From: evghenii Date: Sun, 2 Feb 2014 18:16:48 +0100 Subject: [PATCH 01/13] +1 --- builtins.cpp | 4 ++++ builtins/target-avx-x2.ll | 7 +++++++ builtins/target-avx.ll | 6 ++++++ builtins/target-generic-1.ll | 6 ++++++ builtins/target-generic-common.ll | 4 ++++ builtins/target-neon-16.ll | 6 ++++++ builtins/target-neon-32.ll | 6 ++++++ builtins/target-neon-8.ll | 6 ++++++ builtins/target-sse2-x2.ll | 6 ++++++ builtins/target-sse2.ll | 6 ++++++ builtins/target-sse4-16.ll | 7 +++++++ builtins/target-sse4-8.ll | 7 +++++++ builtins/target-sse4-x2.ll | 6 ++++++ builtins/target-sse4.ll | 6 ++++++ builtins/util.m4 | 34 +++++++++++++++++++++++++++++++ stdlib.ispc | 9 ++++++++ 16 files changed, 126 insertions(+) diff --git a/builtins.cpp b/builtins.cpp index d6eefde5..da66ed5f 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -497,6 +497,8 @@ lSetInternalFunctions(llvm::Module *module) { "__prefetch_read_uniform_nt", "__rcp_uniform_float", "__rcp_varying_float", + "__rcp_uniform_double", + "__rcp_varying_double", "__rdrand_i16", "__rdrand_i32", "__rdrand_i64", @@ -534,6 +536,8 @@ lSetInternalFunctions(llvm::Module *module) { "__round_varying_float", "__rsqrt_uniform_float", "__rsqrt_varying_float", + "__rsqrt_uniform_double", + "__rsqrt_varying_double", "__set_system_isa", "__sext_uniform_bool", "__sext_varying_bool", diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index f8fd5cd5..802f726b 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -687,3 +687,10 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1) ret <16 x double> %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e98a3843..2abed8fe 
100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -559,3 +559,9 @@ gen_scatter(float) gen_scatter(i64) gen_scatter(double) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index c43a12a7..40d7ae8c 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -992,3 +992,9 @@ declare @__float_to_half_varying( %v) nounwind read define_avgs() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 2b2b21c9..4f87ad26 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -191,9 +191,13 @@ declare @__max_varying_double(, declare float @__rsqrt_uniform_float(float) nounwind readnone declare float @__rcp_uniform_float(float) nounwind readnone +declare double @__rsqrt_uniform_double(double) nounwind readnone +declare double @__rcp_uniform_double(double) nounwind readnone declare float @__sqrt_uniform_float(float) nounwind readnone declare @__rcp_varying_float() nounwind readnone declare @__rsqrt_varying_float() nounwind readnone +declare @__rcp_varying_double() nounwind readnone +declare @__rsqrt_varying_double() nounwind readnone declare @__sqrt_varying_float() nounwind readnone declare double @__sqrt_uniform_double(double) nounwind readnone diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll index a0575927..0f144b66 100644 --- a/builtins/target-neon-16.ll +++ b/builtins/target-neon-16.ll @@ -515,3 +515,9 @@ define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone { %r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1) ret <8 x i16> %r } + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() diff --git a/builtins/target-neon-32.ll b/builtins/target-neon-32.ll index 30b062c9..d30efff2 100644 --- a/builtins/target-neon-32.ll +++ b/builtins/target-neon-32.ll @@ -485,3 +485,9 @@ define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone { %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1) ret <4 x i16> %r } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll index 2accfe53..4b6dfa3a 100644 --- a/builtins/target-neon-8.ll +++ b/builtins/target-neon-8.ll @@ -581,3 +581,9 @@ define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone { v8tov16(i16, %r0, %r1, %r) ret <16 x i16> %r } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 77bf1a9d..41b4be09 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -652,3 +652,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) ret <8 x double> %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index e42d4990..ac091cdb 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -587,3 +587,9 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 
reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 72b81ff0..55c94ce6 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -488,3 +488,10 @@ define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) { define_avg_up_int8() define_avg_up_int16() define_down_avgs() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 69b355e3..4d81df92 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -490,3 +490,10 @@ define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { define_avg_up_int8() define_avg_up_int16() define_down_avgs() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 842db53f..09088dc3 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -592,3 +592,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r define_avgs() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 16177b47..747a6a58 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -515,3 +515,9 @@ gen_scatter(double) define_avgs() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reciprocals in double precision, if supported + +rsqrt_double() +rcp_double() + diff --git a/builtins/util.m4 b/builtins/util.m4 index f9ae7cd1..a8340adb 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4531,3 +4531,37 @@ 
define(`define_avgs', ` define_up_avgs() define_down_avgs() ') + +define(`rsqrt_double', ` +define double @__rsqrt_uniform_double(double) nounwind alwaysinline readnone +{ + %flt = fptrunc double %0 to float + %res = call float @__rsqrt_uniform_float(float %flt) + %dres = fpext float %res to double + ret double %dres +} +define @__rsqrt_varying_double() nounwind alwaysinline readnone +{ + %flt = fptrunc %0 to + %res = call @__rsqrt_varying_float( %flt) + %dres = fpext %res to + ret %dres +} +') + +define(`rcp_double', ` +define double @__rcp_uniform_double(double) nounwind alwaysinline readnone +{ + %flt = fptrunc double %0 to float + %res = call float @__rcp_uniform_float(float %flt) + %dres = fpext float %res to double + ret double %dres +} +define @__rcp_varying_double() nounwind alwaysinline readnone +{ + %flt = fptrunc %0 to + %res = call @__rcp_varying_float( %flt) + %dres = fpext %res to + ret %dres +} +') diff --git a/stdlib.ispc b/stdlib.ispc index 3b17283d..de2221b2 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -3517,6 +3517,15 @@ static inline uniform double sqrt(uniform double v) { return __sqrt_uniform_double(v); } +__declspec(safe) +static inline double rsqrt(double v) { + return __rsqrt_varying_double(v); +} + +__declspec(safe) +static inline uniform double rsqrt(uniform double v) { + return __rsqrt_uniform_double(v); +} __declspec(safe) static inline double ldexp(double x, int n) { unsigned int64 ex = 0x7ff0000000000000; From 4515dd5c89e6b3ceecfe8d9eb5bffa7a519aaac8 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Sun, 2 Feb 2014 18:19:56 +0100 Subject: [PATCH 02/13] added tests for rcp/rsqrt double --- tests/test-147.ispc | 16 ++++++++++++++++ tests/test-148.ispc | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 tests/test-147.ispc create mode 100644 tests/test-148.ispc diff --git a/tests/test-147.ispc b/tests/test-147.ispc new file mode 100644 index 00000000..4091a78d --- /dev/null +++ b/tests/test-147.ispc @@ -0,0 +1,16 @@ + 
+export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + double x = aFOO[programIndex&0x3]; + double d, ix; + ix = rcp(x); + d = (ix - 1.0d0 / x); + d = (d < 0.0d0) ? -d : d; + RET[programIndex] = (d < 1d-7) ? 1.0d0 : 0.0d0; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1.0d0; +} diff --git a/tests/test-148.ispc b/tests/test-148.ispc new file mode 100644 index 00000000..f8284e1a --- /dev/null +++ b/tests/test-148.ispc @@ -0,0 +1,16 @@ + +export uniform int width() { return programCount; } + + +export void f_f(uniform float RET[], uniform float aFOO[]) { + double x = aFOO[programIndex]; + double d, invsqrt = rsqrt(x); + d = (x * (invsqrt * invsqrt)) - 1.0d0; + if (d < 0.0d0) d = -d; + RET[programIndex] = d > 1d-5; +} + + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} From b0753dc93d0dfb31e7af74a25f13dd656161bd73 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Sun, 2 Feb 2014 18:20:05 +0100 Subject: [PATCH 03/13] added double-version for rcp --- stdlib.ispc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/stdlib.ispc b/stdlib.ispc index de2221b2..42b60b52 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1391,6 +1391,16 @@ static inline uniform float rcp(uniform float v) { return __rcp_uniform_float(v); } +__declspec(safe) +static inline double rcp(double v) { + return __rcp_varying_double(v); +} + +__declspec(safe) +static inline uniform double rcp(uniform double v) { + return __rcp_uniform_double(v); +} + /////////////////////////////////////////////////////////////////////////// // min/max From eb1a495a7aab85773fdfd5cad37e29c54d380460 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Tue, 4 Feb 2014 14:44:54 +0100 Subject: [PATCH 04/13] added support for fast approximate rsqrt(double). 
Provides 16 digit accuracy but is over 3x faster than 1/sqrt(double) --- builtins/target-avx1-i64x4base.ll | 2 ++ builtins/target-generic-common.ll | 9 ++++---- builtins/util.m4 | 22 +++++++++---------- stdlib.ispc | 35 +++++++++++++++++++++++++------ tests/test-148.ispc | 4 ++-- 5 files changed, 48 insertions(+), 24 deletions(-) diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index e1832030..1b1e7c4f 100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -511,3 +511,5 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %call } +rsqrt_double() +rcp_double() diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 4f87ad26..dfdc7e9e 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -191,13 +191,14 @@ declare @__max_varying_double(, declare float @__rsqrt_uniform_float(float) nounwind readnone declare float @__rcp_uniform_float(float) nounwind readnone -declare double @__rsqrt_uniform_double(double) nounwind readnone -declare double @__rcp_uniform_double(double) nounwind readnone +declare double @__rsqrt_uniform_double(double, ) nounwind readnone +declare double @__rcp_uniform_double(double, ) nounwind readnone declare float @__sqrt_uniform_float(float) nounwind readnone declare @__rcp_varying_float() nounwind readnone declare @__rsqrt_varying_float() nounwind readnone -declare @__rcp_varying_double() nounwind readnone -declare @__rsqrt_varying_double() nounwind readnone +declare @__rcp_varying_double(, ) nounwind readnone +declare @__rsqrt_varying_double(, ) nounwind readnone + declare @__sqrt_varying_float() nounwind readnone declare double @__sqrt_uniform_double(double) nounwind readnone diff --git a/builtins/util.m4 b/builtins/util.m4 index a8340adb..3915bc4e 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4533,31 +4533,29 @@ define_down_avgs() ')
define(`rsqrt_double', ` -define double @__rsqrt_uniform_double(double) nounwind alwaysinline readnone +declare double @__rsqrt_safe_uniform_double___und(double, ) +define double @__rsqrt_uniform_double(double, ) nounwind alwaysinline readnone { - %flt = fptrunc double %0 to float - %res = call float @__rsqrt_uniform_float(float %flt) - %dres = fpext float %res to double - ret double %dres + %res = call double @__rsqrt_safe_uniform_double___und(double %0, %1) + ret double %res } -define @__rsqrt_varying_double() nounwind alwaysinline readnone +declare @__rsqrt_safe_varying_double___vyd(, ) +define @__rsqrt_varying_double(, ) nounwind alwaysinline readnone { - %flt = fptrunc %0 to - %res = call @__rsqrt_varying_float( %flt) - %dres = fpext %res to - ret %dres + %res = call @__rsqrt_safe_varying_double___vyd( %0, %1) + ret %res } ') define(`rcp_double', ` -define double @__rcp_uniform_double(double) nounwind alwaysinline readnone +define double @__rcp_uniform_double(double, ) nounwind alwaysinline readnone { %flt = fptrunc double %0 to float %res = call float @__rcp_uniform_float(float %flt) %dres = fpext float %res to double ret double %dres } -define @__rcp_varying_double() nounwind alwaysinline readnone +define @__rcp_varying_double(, ) nounwind alwaysinline readnone { %flt = fptrunc %0 to %res = call @__rcp_varying_float( %flt) diff --git a/stdlib.ispc b/stdlib.ispc index 42b60b52..70b5f0d1 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1393,12 +1393,12 @@ static inline uniform float rcp(uniform float v) { __declspec(safe) static inline double rcp(double v) { - return __rcp_varying_double(v); + return __rcp_varying_double(v, (IntMaskType)__mask); } __declspec(safe) static inline uniform double rcp(uniform double v) { - return __rcp_uniform_double(v); + return __rcp_uniform_double(v, (IntMaskType)__mask); } /////////////////////////////////////////////////////////////////////////// @@ -3527,14 +3527,37 @@ static inline uniform double sqrt(uniform double v) { 
return __sqrt_uniform_double(v); } -__declspec(safe) -static inline double rsqrt(double v) { - return __rsqrt_varying_double(v); +#define RSQRTD(QUAL) \ +__declspec(safe) \ +static inline QUAL double __rsqrt_iterate_double(QUAL double x, QUAL double y) \ +{ \ + QUAL double xh = x*0.5d; \ + y += y*(0.5d0 - xh*y*y); \ + y += y*(0.5d0 - xh*y*y); \ + return y; \ +} \ +__declspec(safe) \ +static inline QUAL double __rsqrt_safe_##QUAL##_double (QUAL double x) \ +{ \ + if (x <= 1.0d+33 && x >= 1.0d-33) \ + return __rsqrt_iterate_double(x, rsqrt((QUAL float)x)); \ + QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \ + QUAL double exp = doublebits( 0x7fd0000000000000 - ex ); /* 1.0d/exponent */ \ + QUAL double exph = doublebits( 0x5fe0000000000000 - (ex >> 1)); /* 1.0d/sqrt(exponent) */ \ + QUAL double y = rsqrt((QUAL float)(x*exp)); \ + return __rsqrt_iterate_double(x, y*exph); \ } +RSQRTD(varying) +__declspec(safe) +static inline double rsqrt(double v) { + return __rsqrt_varying_double(v, (IntMaskType)__mask); +} + +RSQRTD(uniform) __declspec(safe) static inline uniform double rsqrt(uniform double v) { - return __rsqrt_uniform_double(v); + return __rsqrt_uniform_double(v, (IntMaskType)__mask); } __declspec(safe) static inline double ldexp(double x, int n) { diff --git a/tests/test-148.ispc b/tests/test-148.ispc index f8284e1a..1c01428c 100644 --- a/tests/test-148.ispc +++ b/tests/test-148.ispc @@ -3,11 +3,11 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - double x = aFOO[programIndex]; + double x = aFOO[programIndex]*1d100; double d, invsqrt = rsqrt(x); d = (x * (invsqrt * invsqrt)) - 1.0d0; if (d < 0.0d0) d = -d; - RET[programIndex] = d > 1d-5; + RET[programIndex] = d > 1d-15; } From fe98fe8cdcca7c2adedeba9fbfff00c3cdeb19e2 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Tue, 4 Feb 2014 15:23:34 +0100 Subject: [PATCH 05/13] added fast approximate rcp(double) accurate to 15 digits --- builtins/util.m4 | 
18 ++++++++---------- stdlib.ispc | 27 ++++++++++++++++++++++++--- tests/test-147.ispc | 4 ++-- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/builtins/util.m4 b/builtins/util.m4 index 3915bc4e..74b5cdd3 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4548,18 +4548,16 @@ define @__rsqrt_varying_double(, ) nounwind alwaysinline readnone +declare double @__rcp_safe_uniform_double___und(double, ) +define double @__rcp_uniform_double(double, ) nounwind alwaysinline readnone { - %flt = fptrunc double %0 to float - %res = call float @__rcp_uniform_float(float %flt) - %dres = fpext float %res to double - ret double %dres + %res = call double @__rcp_safe_uniform_double___und(double %0, %1) + ret double %res } -define @__rcp_varying_double(, ) nounwind alwaysinline readnone +declare @__rcp_safe_varying_double___vyd(, ) +define @__rcp_varying_double(, ) nounwind alwaysinline readnone { - %flt = fptrunc %0 to - %res = call @__rcp_varying_float( %flt) - %dres = fpext %res to - ret %dres + %res = call @__rcp_safe_varying_double___vyd( %0, %1) + ret %res } ') diff --git a/stdlib.ispc b/stdlib.ispc index 70b5f0d1..bd12034c 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1391,11 +1391,32 @@ static inline uniform float rcp(uniform float v) { return __rcp_uniform_float(v); } +#define RCPD(QUAL) \ +__declspec(safe) \ +static inline QUAL double __rcp_iterate_##QUAL##_double(QUAL double v, QUAL double iv) \ +{ \ + iv = iv * (2.0d - v*iv); \ + iv = iv * (2.0d - v*iv); \ + return iv; \ +} \ +__declspec(safe) \ +static inline QUAL double __rcp_safe_##QUAL##_double(QUAL double x) \ +{ \ + if (x <= 1.0d+33 && x >= 1.0d-33) \ + return __rcp_iterate_##QUAL##_double(x, rcp((QUAL float)x)); \ + QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \ + QUAL double exp = doublebits( 0x7fd0000000000000 + ~ex ); \ + QUAL double y = rcp((QUAL float)(x*exp)); \ + return __rcp_iterate_##QUAL##_double(x, y*exp); \ +} + +RCPD(varying) __declspec(safe) static inline double rcp(double 
v) { return __rcp_varying_double(v, (IntMaskType)__mask); } +RCPD(uniform) __declspec(safe) static inline uniform double rcp(uniform double v) { return __rcp_uniform_double(v, (IntMaskType)__mask); @@ -3529,7 +3550,7 @@ static inline uniform double sqrt(uniform double v) { #define RSQRTD(QUAL) \ __declspec(safe) \ -static inline QUAL double __rsqrt_iterate_double(QUAL double x, QUAL double y) \ +static inline QUAL double __rsqrt_iterate_##QUAL##_double(QUAL double x, QUAL double y) \ { \ QUAL double xh = x*0.5d; \ y += y*(0.5d0 - xh*y*y); \ @@ -3540,12 +3561,12 @@ __declspec(safe) \ static inline QUAL double __rsqrt_safe_##QUAL##_double (QUAL double x) \ { \ if (x <= 1.0d+33 && x >= 1.0d-33) \ - return __rsqrt_iterate_double(x, rsqrt((QUAL float)x)); \ + return __rsqrt_iterate_##QUAL##_double(x, rsqrt((QUAL float)x)); \ QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \ QUAL double exp = doublebits( 0x7fd0000000000000 - ex ); /* 1.0d/exponent */ \ QUAL double exph = doublebits( 0x5fe0000000000000 - (ex >> 1)); /* 1.0d/sqrt(exponent) */ \ QUAL double y = rsqrt((QUAL float)(x*exp)); \ - return __rsqrt_iterate_double(x, y*exph); \ + return __rsqrt_iterate_##QUAL##_double(x, y*exph); \ } RSQRTD(varying) diff --git a/tests/test-147.ispc b/tests/test-147.ispc index 4091a78d..5c5d4bc1 100644 --- a/tests/test-147.ispc +++ b/tests/test-147.ispc @@ -3,12 +3,12 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - double x = aFOO[programIndex&0x3]; + double x = aFOO[programIndex&0x3]*1d100; double d, ix; ix = rcp(x); d = (ix - 1.0d0 / x); d = (d < 0.0d0) ? -d : d; - RET[programIndex] = (d < 1d-7) ? 1.0d0 : 0.0d0; + RET[programIndex] = (d < 1d-15 && !isnan(d)) ? 
1.0d0 : 0.0d0; } export void result(uniform float RET[]) { From d3a6693eef93b97a91fde0bcf20388aa95921b6e Mon Sep 17 00:00:00 2001 From: Evghenii Date: Tue, 4 Feb 2014 16:29:23 +0100 Subject: [PATCH 06/13] adding __have_native_{rsqrtd,rcpd} to select between native support for double precision reciprocals and using slower but safe version in stdlib --- builtins.cpp | 4 ++++ builtins/target-generic-common.ll | 8 ++++---- builtins/util.m4 | 28 ++++------------------------ ispc.cpp | 9 ++++++++- ispc.h | 10 ++++++++++ stdlib.ispc | 27 ++++++++++++++++++++------- 6 files changed, 50 insertions(+), 36 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index da66ed5f..fee322e7 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -1150,6 +1150,10 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__have_native_transcendentals", g->target->hasTranscendentals(), module, symbolTable); + lDefineConstantInt("__have_native_rsqrtd", g->target->hasRsqrtd(), + module, symbolTable); + lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(), + module, symbolTable); if (g->forceAlignment != -1) { llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true); diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index dfdc7e9e..401c862d 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -191,13 +191,13 @@ declare @__max_varying_double(, declare float @__rsqrt_uniform_float(float) nounwind readnone declare float @__rcp_uniform_float(float) nounwind readnone -declare double @__rsqrt_uniform_double(double, ) nounwind readnone -declare double @__rcp_uniform_double(double, ) nounwind readnone +declare double @__rsqrt_uniform_double(double) nounwind readnone +declare double @__rcp_uniform_double(double) nounwind readnone declare float @__sqrt_uniform_float(float) nounwind readnone declare @__rcp_varying_float() nounwind readnone 
declare @__rsqrt_varying_float() nounwind readnone -declare @__rcp_varying_double(, ) nounwind readnone -declare @__rsqrt_varying_double(, ) nounwind readnone +declare @__rcp_varying_double() nounwind readnone +declare @__rsqrt_varying_double() nounwind readnone declare @__sqrt_varying_float() nounwind readnone diff --git a/builtins/util.m4 b/builtins/util.m4 index 74b5cdd3..8a1f280a 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4533,31 +4533,11 @@ define_down_avgs() ') define(`rsqrt_double', ` -declare double @__rsqrt_safe_uniform_double___und(double, ) -define double @__rsqrt_uniform_double(double, ) nounwind alwaysinline readnone -{ - %res = call double @__rsqrt_safe_uniform_double___und(double %0, %1) - ret double %res -} -declare @__rsqrt_safe_varying_double___vyd(, ) -define @__rsqrt_varying_double(, ) nounwind alwaysinline readnone -{ - %res = call @__rsqrt_safe_varying_double___vyd( %0, %1) - ret %res -} +declare double @__rsqrt_uniform_double(double) +declare @__rsqrt_varying_double() ') define(`rcp_double', ` -declare double @__rcp_safe_uniform_double___und(double, ) -define double @__rcp_uniform_double(double, ) nounwind alwaysinline readnone -{ - %res = call double @__rcp_safe_uniform_double___und(double %0, %1) - ret double %res -} -declare @__rcp_safe_varying_double___vyd(, ) -define @__rcp_varying_double(, ) nounwind alwaysinline readnone -{ - %res = call @__rcp_safe_varying_double___vyd( %0, %1) - ret %res -} +declare double @__rcp_uniform_double(double) +declare @__rcp_varying_double() ') diff --git a/ispc.cpp b/ispc.cpp index ed326b14..1386d65e 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -201,7 +201,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_hasRand(false), m_hasGather(false), m_hasScatter(false), - m_hasTranscendentals(false) + m_hasTranscendentals(false), + m_hasRsqrtd(false), + m_hasRcpd(false) { if (isa == NULL) { if (cpu != NULL) { @@ -419,6 +421,7 @@ Target::Target(const char *arch, const 
char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-8") || !strcasecmp(isa, "generic-x8")) { @@ -431,6 +434,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-16") || !strcasecmp(isa, "generic-x16")) { @@ -443,6 +447,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-32") || !strcasecmp(isa, "generic-x32")) { @@ -455,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-64") || !strcasecmp(isa, "generic-x64")) { @@ -467,6 +473,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasHalf = true; this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; + this->m_hasRsqrtd = this->m_hasRcpd = true; } else if (!strcasecmp(isa, "generic-1") || !strcasecmp(isa, "generic-x1")) { diff --git a/ispc.h b/ispc.h index 88eb8353..4b6df8c3 100644 --- a/ispc.h +++ b/ispc.h @@ -281,6 +281,10 @@ public: bool hasScatter() const {return m_hasScatter;} bool hasTranscendentals() const {return m_hasTranscendentals;} + + bool hasRsqrtd() const {return m_hasRsqrtd;} + + bool hasRcpd() const {return m_hasRcpd;} private: @@ -380,6 +384,12 @@ private: /** Indicates whether the target has support for 
transcendentals (beyond sqrt, which we assume that all of them handle). */ bool m_hasTranscendentals; + + /** Indicates whether there is an ISA double precision rsqrt. */ + bool m_hasRsqrtd; + + /** Indicates whether there is an ISA double precision rcp. */ + bool m_hasRcpd; }; diff --git a/stdlib.ispc b/stdlib.ispc index bd12034c..2f204aa0 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1412,14 +1412,21 @@ static inline QUAL double __rcp_safe_##QUAL##_double(QUAL double x) \ RCPD(varying) __declspec(safe) -static inline double rcp(double v) { - return __rcp_varying_double(v, (IntMaskType)__mask); -} +__declspec(safe) +static inline double rcp(double v) { + if (__have_native_rcpd) + return __rcp_varying_double(v); + else + return __rcp_safe_varying_double(v); +} RCPD(uniform) -__declspec(safe) +__declspec(safe) static inline uniform double rcp(uniform double v) { - return __rcp_uniform_double(v, (IntMaskType)__mask); + if (__have_native_rcpd) + return __rcp_uniform_double(v); + else + return __rcp_safe_uniform_double(v); } /////////////////////////////////////////////////////////////////////////// @@ -3572,13 +3579,19 @@ static inline QUAL double __rsqrt_safe_##QUAL##_double (QUAL double x) \ RSQRTD(varying) __declspec(safe) static inline double rsqrt(double v) { - return __rsqrt_varying_double(v, (IntMaskType)__mask); + if (__have_native_rsqrtd) + return __rsqrt_varying_double(v); + else + return __rsqrt_safe_varying_double(v); } RSQRTD(uniform) __declspec(safe) static inline uniform double rsqrt(uniform double v) { - return __rsqrt_uniform_double(v, (IntMaskType)__mask); + if (__have_native_rsqrtd) + return __rsqrt_uniform_double(v); + else + return __rsqrt_safe_uniform_double(v); } __declspec(safe) static inline double ldexp(double x, int n) { From 732a315a4ba6ecb4b20c6352f31f0d551099b3e5 Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 5 Feb 2014 13:04:45 +0100 Subject: [PATCH 07/13] removed __declspec(safe) duplicate --- stdlib.ispc | 1 - 1 file changed, 1 
deletion(-) diff --git a/stdlib.ispc b/stdlib.ispc index 2f204aa0..24217cd0 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1411,7 +1411,6 @@ static inline QUAL double __rcp_safe_##QUAL##_double(QUAL double x) \ } RCPD(varying) -__declspec(safe) __declspec(safe) static inline double rcp(double v) { if (__have_native_rcpd) From 09e8381ec7c15d11d2f973cfaf95028e788a2e03 Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 5 Feb 2014 13:05:04 +0100 Subject: [PATCH 08/13] change {rsqrt,rcp}_double to {rsqrt,rcp}d_decl --- builtins/target-avx-x2.ll | 4 ++-- builtins/target-avx.ll | 5 ++--- builtins/target-avx1-i64x4base.ll | 4 ++-- builtins/target-generic-1.ll | 5 ++--- builtins/target-neon-16.ll | 4 ++-- builtins/target-neon-32.ll | 4 ++-- builtins/target-neon-8.ll | 4 ++-- builtins/target-sse2-x2.ll | 4 ++-- builtins/target-sse2.ll | 4 ++-- builtins/target-sse4-16.ll | 5 ++--- builtins/target-sse4-8.ll | 5 ++--- builtins/target-sse4-x2.ll | 5 ++--- builtins/target-sse4.ll | 5 ++--- builtins/util.m4 | 4 ++-- 14 files changed, 28 insertions(+), 34 deletions(-) diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 802f726b..b3a77871 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -691,6 +691,6 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 2abed8fe..9738f9d3 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -562,6 +562,5 @@ gen_scatter(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() - +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index 1b1e7c4f..a6601a28 
100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -511,5 +511,5 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %call } -rsqrt_double() -rcp_double() +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 40d7ae8c..3dcd8373 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -995,6 +995,5 @@ define_avgs() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() - +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll index 0f144b66..8e0ef121 100644 --- a/builtins/target-neon-16.ll +++ b/builtins/target-neon-16.ll @@ -519,5 +519,5 @@ define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-neon-32.ll b/builtins/target-neon-32.ll index d30efff2..d6e861a2 100644 --- a/builtins/target-neon-32.ll +++ b/builtins/target-neon-32.ll @@ -489,5 +489,5 @@ define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll index 4b6dfa3a..aaa0a7b7 100644 --- a/builtins/target-neon-8.ll +++ b/builtins/target-neon-8.ll @@ -585,5 +585,5 @@ define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() +rsqrtd_decl() 
+rcpd_decl() diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 41b4be09..bfb927e5 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -656,5 +656,5 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index ac091cdb..93a8eb93 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -591,5 +591,5 @@ gen_scatter(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 55c94ce6..0de5c1b4 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -492,6 +492,5 @@ define_down_avgs() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() - +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 4d81df92..79f44212 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -494,6 +494,5 @@ define_down_avgs() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() - +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 09088dc3..ceff27f0 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -595,6 +595,5 @@ define_avgs() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() - +rsqrtd_decl() 
+rcpd_decl() diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 747a6a58..9e2ac8a5 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -518,6 +518,5 @@ define_avgs() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reciprocals in double precision, if supported -rsqrt_double() -rcp_double() - +rsqrtd_decl() +rcpd_decl() diff --git a/builtins/util.m4 b/builtins/util.m4 index 8a1f280a..fbd929a1 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4532,12 +4532,12 @@ define_up_avgs() define_down_avgs() ') -define(`rsqrt_double', ` +define(`rsqrtd_decl', ` declare double @__rsqrt_uniform_double(double) declare @__rsqrt_varying_double() ') -define(`rcp_double', ` +define(`rcpd_decl', ` declare double @__rcp_uniform_double(double) declare @__rcp_varying_double() ') From 688d9c9a820c3fc8e34fe833e0fde16db8a62b12 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Wed, 5 Feb 2014 13:20:44 +0100 Subject: [PATCH 09/13] added support for rsqrtd/rcpd for generic-*.h --- examples/intrinsics/generic-16.h | 10 ++++++++++ examples/intrinsics/generic-32.h | 10 ++++++++++ examples/intrinsics/generic-64.h | 10 ++++++++++ 3 files changed, 30 insertions(+) diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 0aa8a3f6..3b5c6ec3 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1086,10 +1086,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); } +static FORCEINLINE double __rsqrt_uniform_double(double v) { + return 1.0 / sqrt(v); +} + static FORCEINLINE float __rcp_uniform_float(float v) { return 1.f / v; } +static FORCEINLINE double __rcp_uniform_double(double v) { + return 1.0 / v; +} + static FORCEINLINE float __sqrt_uniform_float(float v) { return sqrtf(v); } @@ -1099,7 +1107,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) { } UNARY_OP(__vec16_f, __rcp_varying_float, __rcp_uniform_float) 
+UNARY_OP(__vec16_d, __rcp_varying_double, __rcp_uniform_double) UNARY_OP(__vec16_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec16_d, __rsqrt_varying_double, __rsqrt_uniform_double) UNARY_OP(__vec16_f, __sqrt_varying_float, __sqrt_uniform_float) UNARY_OP(__vec16_d, __sqrt_varying_double, __sqrt_uniform_double) diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 924b049d..f5bb233c 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1138,10 +1138,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); } +static FORCEINLINE double __rsqrt_uniform_double(double v) { + return 1.0 / sqrt(v); +} + static FORCEINLINE float __rcp_uniform_float(float v) { return 1.f / v; } +static FORCEINLINE double __rcp_uniform_double(double v) { + return 1.0 / v; +} + static FORCEINLINE float __sqrt_uniform_float(float v) { return sqrtf(v); } @@ -1151,7 +1159,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) { } UNARY_OP(__vec32_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec32_d, __rcp_varying_double, __rcp_uniform_double) UNARY_OP(__vec32_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec32_d, __rsqrt_varying_double, __rsqrt_uniform_double) UNARY_OP(__vec32_f, __sqrt_varying_float, __sqrt_uniform_float) UNARY_OP(__vec32_d, __sqrt_varying_double, __sqrt_uniform_double) diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index b1451c96..a7148c8b 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1271,10 +1271,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) { return 1.f / sqrtf(v); } +static FORCEINLINE double __rsqrt_uniform_double(double v) { + return 1.0 / sqrt(v); +} + static FORCEINLINE float __rcp_uniform_float(float v) { return 1.f / v; } +static FORCEINLINE double __rcp_uniform_double(double v) { + return 1.0 / v; +} + static FORCEINLINE float 
__sqrt_uniform_float(float v) { return sqrtf(v); } @@ -1284,7 +1292,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) { } UNARY_OP(__vec64_f, __rcp_varying_float, __rcp_uniform_float) +UNARY_OP(__vec64_d, __rcp_varying_double, __rcp_uniform_double) UNARY_OP(__vec64_f, __rsqrt_varying_float, __rsqrt_uniform_float) +UNARY_OP(__vec64_d, __rsqrt_varying_double, __rsqrt_uniform_double) UNARY_OP(__vec64_f, __sqrt_varying_float, __sqrt_uniform_float) UNARY_OP(__vec64_d, __sqrt_varying_double, __sqrt_uniform_double) From f225b558ec21b07cfbebfe7f84cedd2fb882405a Mon Sep 17 00:00:00 2001 From: Evghenii Date: Wed, 5 Feb 2014 13:42:45 +0100 Subject: [PATCH 10/13] added {rsqrt,rcp}d support for sse4.h --- examples/intrinsics/sse4.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 5dd424d9..45b31be1 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2420,6 +2420,21 @@ static FORCEINLINE __vec4_f __rcp_varying_float(__vec4_f v) { return r; } +static FORCEINLINE __vec4_d __rcp_varying_double(__vec4_d x) { + __vec4_i64 ex = __and(__cast_bits(__vec4_i64(), x), __smear_i64<__vec4_i64>(0x7fe0000000000000)); + __vec4_d exp = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x7fd0000000000000), ex)); + __vec4_f xf = __cast_fptrunc(__vec4_f(), __mul(x, exp)); + __vec4_f yf = __rcp_varying_float(xf); + __vec4_d y = __mul(__cast_fpext(__vec4_d(), yf), exp); + y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(2.0), __mul(x, y)))); + y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(2.0), __mul(x, y)))); + return y; +} +static FORCEINLINE double __rcp_uniform_double(double v) +{ + return __extract_element(__rcp_varying_double(__smear_double<__vec4_d>(v)),0); +} + static FORCEINLINE __vec4_f __rsqrt_varying_float(__vec4_f v) { __m128 rsqrt = _mm_rsqrt_ps(v.v); // Newton-Raphson iteration to improve precision @@ -2431,6 +2446,22 @@ static FORCEINLINE 
__vec4_f __rsqrt_varying_float(__vec4_f v) { __m128 half_scale = _mm_mul_ps(_mm_set1_ps(0.5), rs_mul); return half_scale; } +static FORCEINLINE __vec4_d __rsqrt_varying_double(__vec4_d x) { + __vec4_i64 ex = __and(__cast_bits(__vec4_i64(), x), __smear_i64<__vec4_i64>(0x7fe0000000000000)); + __vec4_d exp = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x7fd0000000000000), ex)); + __vec4_d exph = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x5fe0000000000000), __lshr(ex,1))); + __vec4_f xf = __cast_fptrunc(__vec4_f(), __mul(x, exp)); + __vec4_f yf = __rsqrt_varying_float(xf); + __vec4_d y = __mul(__cast_fpext(__vec4_d(), yf), exph); + __vec4_d xh = __mul(x, __smear_double<__vec4_d>(0.5)); + y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(0.5), __mul(xh, __mul(y,y))))); + y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(0.5), __mul(xh, __mul(y,y))))); + return y; +} +static FORCEINLINE double __rsqrt_uniform_double(double v) +{ + return __extract_element(__rsqrt_varying_double(__smear_double<__vec4_d>(v)),0); +} static FORCEINLINE __vec4_f __sqrt_varying_float(__vec4_f v) { return _mm_sqrt_ps(v.v); From eb01ffd4e6b11e0e3679fe59d316cfb629cb283d Mon Sep 17 00:00:00 2001 From: Evghenii Date: Wed, 5 Feb 2014 13:43:07 +0100 Subject: [PATCH 11/13] first commit for {rsqrt,rcp}d knc support. 
going to test on other node now --- examples/intrinsics/knc-i1x16.h | 31 +++++++++++++++++++++++++++++++ examples/intrinsics/knc-i1x8.h | 30 ++++++++++++++++++++++++++++++ examples/intrinsics/knc.h | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+) diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index 141c47bb..ba6ef005 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -1811,6 +1811,20 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) return _mm512_recip_ps(v); #endif } +static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) { + __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); + __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); + __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); + __vec16_f yf = __rcp_varying_float(xf); + __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exp); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); + return y; +} +static FORCEINLINE double __rcp_uniform_double(double v) +{ + return __extract_element(__rcp_varying_double(__smear_double<__vec16_d>(v)),0); +} static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { @@ -1820,6 +1834,23 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) return _mm512_invsqrt_ps(v); #endif } +static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) { + __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); + __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); + __vec16_d exph = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x5fe0000000000000), __lshr(ex,1))); + __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); + __vec16_f yf = 
__rsqrt_varying_float(xf); + __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exph); + __vec16_d xh = __mul(x, __smear_double<__vec16_d>(0.5)); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); + return y; +} +static FORCEINLINE double __rsqrt_uniform_double(double v) +{ + return __extract_element(__rsqrt_varying_double(__smear_double<__vec16_d>(v)),0); +} + static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);} static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));} diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index 32f39c4a..4a5b28a9 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -1836,6 +1836,20 @@ static FORCEINLINE __vec8_f __rcp_varying_float(__vec8_f v) { return _mm512_mask_recip_ps(FZERO, 0xFF, v); #endif } +static FORCEINLINE __vec8_d __rcp_varying_double(__vec8_d x) { + __vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000)); + __vec8_d exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex)); + __vec8_f xf = __cast_fptrunc(__vec8_f(), __mul(x, exp)); + __vec8_f yf = __rcp_varying_float(xf); + __vec8_d y = __mul(__cast_fpext(__vec8_d(), yf), exp); + y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(2.0), __mul(x, y)))); + y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(2.0), __mul(x, y)))); + return y; +} +static FORCEINLINE double __rcp_uniform_double(double v) +{ + return __extract_element(__rcp_varying_double(__smear_double<__vec8_d>(v)),0); +} static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) { #ifdef ISPC_FAST_MATH @@ -1844,6 +1858,22 @@ static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) { return _mm512_mask_invsqrt_ps(FZERO,0xFF,v); #endif } +static 
FORCEINLINE __vec8_d __rsqrt_varying_double(__vec8_d x) { + __vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000)); + __vec8_d exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex)); + __vec8_d exph = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x5fe0000000000000), __lshr(ex,1))); + __vec8_f xf = __cast_fptrunc(__vec8_f(), __mul(x, exp)); + __vec8_f yf = __rsqrt_varying_float(xf); + __vec8_d y = __mul(__cast_fpext(__vec8_d(), yf), exph); + __vec8_d xh = __mul(x, __smear_double<__vec8_d>(0.5)); + y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y,y))))); + y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y,y))))); + return y; +} +static FORCEINLINE double __rsqrt_uniform_double(double v) +{ + return __extract_element(__rsqrt_varying_double(__smear_double<__vec8_d>(v)),0); +} static FORCEINLINE __vec8_f __sqrt_varying_float (__vec8_f v) { return _mm512_mask_sqrt_ps(FZERO,0xFF,v);} #endif diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 0077ad88..72951845 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1472,6 +1472,21 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) { return _mm512_recip_ps(v); #endif } +static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) { + __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); + __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); + __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); + __vec16_f yf = __rcp_varying_float(xf); + __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exp); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); + return y; +} +static FORCEINLINE double __rcp_uniform_double(double v) +{ + return 
__extract_element(__rcp_varying_double(__smear_double<__vec16_d>(v)),0); +} + static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { #ifdef ISPC_FAST_MATH @@ -1480,6 +1495,23 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { return _mm512_invsqrt_ps(v); #endif } +static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) { + __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); + __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); + __vec16_d exph = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x5fe0000000000000), __lshr(ex,1))); + __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); + __vec16_f yf = __rsqrt_varying_float(xf); + __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exph); + __vec16_d xh = __mul(x, __smear_double<__vec16_d>(0.5)); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); + y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); + return y; +} +static FORCEINLINE double __rsqrt_uniform_double(double v) +{ + return __extract_element(__rsqrt_varying_double(__smear_double<__vec16_d>(v)),0); +} + static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) { return _mm512_exp_ps(v); From ecc9c88ff80ed44fdb31ff468103192ad4bc1e3b Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 5 Feb 2014 13:52:24 +0100 Subject: [PATCH 12/13] fix packed_store_active2 for knc-i1x8.h --- examples/intrinsics/knc-i1x8.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index 4a5b28a9..478ad75a 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -2526,8 +2526,8 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val, _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, 
_MM_HINT_NONE); return _mm_countbits_32(uint32_t(0xFF & mask)); } -static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val, - __vec4_i1 mask) { +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec8_i32 val, + __vec8_i1 mask) { return __packed_store_active(ptr, val, mask); } static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val, @@ -2538,8 +2538,8 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val, __vec8_i1 mask) { return __packed_store_active((uint32_t *)p, val, mask); } -static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val, - __vec4_i1 mask) { +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec8_i32 val, + __vec8_i1 mask) { return __packed_store_active(ptr, val, mask); } From c59cff396d4df0ffc74e9425072699b5ea750b7d Mon Sep 17 00:00:00 2001 From: evghenii Date: Wed, 5 Feb 2014 13:55:38 +0100 Subject: [PATCH 13/13] added {rsqrt,rcp}d support for knc.h. test-147.ispc & test-148.ispc pass. 
--- examples/intrinsics/knc.h | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 72951845..458da458 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1473,18 +1473,14 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) { #endif } static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) { - __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); - __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); - __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); - __vec16_f yf = __rcp_varying_float(xf); - __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exp); - y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); - y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y)))); - return y; + __vec16_d y; + for (int i = 0; i < 16; i++) + __insert_element(&y, i, 1.0/__extract_element(x,i)); + return y; } static FORCEINLINE double __rcp_uniform_double(double v) { - return __extract_element(__rcp_varying_double(__smear_double<__vec16_d>(v)),0); + return 1.0/v; } @@ -1496,20 +1492,14 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) { #endif } static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) { - __vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000)); - __vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex)); - __vec16_d exph = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x5fe0000000000000), __lshr(ex,1))); - __vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp)); - __vec16_f yf = __rsqrt_varying_float(xf); - __vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exph); - __vec16_d xh = __mul(x, __smear_double<__vec16_d>(0.5)); - y = __add(y, __mul(y, 
__sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); - y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y))))); - return y; + __vec16_d y; + for (int i = 0; i < 16; i++) + __insert_element(&y, i, 1.0/sqrt(__extract_element(x,i))); + return y; } static FORCEINLINE double __rsqrt_uniform_double(double v) { - return __extract_element(__rsqrt_varying_double(__smear_double<__vec16_d>(v)),0); + return 1.0/v; }