Merge pull request #730 from egaburov/double_math
Added double precision support for reciprocals: rsqrt rcp
This commit is contained in:
@@ -497,6 +497,8 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__prefetch_read_uniform_nt",
|
||||
"__rcp_uniform_float",
|
||||
"__rcp_varying_float",
|
||||
"__rcp_uniform_double",
|
||||
"__rcp_varying_double",
|
||||
"__rdrand_i16",
|
||||
"__rdrand_i32",
|
||||
"__rdrand_i64",
|
||||
@@ -534,6 +536,8 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__round_varying_float",
|
||||
"__rsqrt_uniform_float",
|
||||
"__rsqrt_varying_float",
|
||||
"__rsqrt_uniform_double",
|
||||
"__rsqrt_varying_double",
|
||||
"__set_system_isa",
|
||||
"__sext_uniform_bool",
|
||||
"__sext_varying_bool",
|
||||
@@ -1146,6 +1150,10 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
symbolTable);
|
||||
lDefineConstantInt("__have_native_transcendentals", g->target->hasTranscendentals(),
|
||||
module, symbolTable);
|
||||
lDefineConstantInt("__have_native_rsqrtd", g->target->hasRsqrtd(),
|
||||
module, symbolTable);
|
||||
lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(),
|
||||
module, symbolTable);
|
||||
|
||||
if (g->forceAlignment != -1) {
|
||||
llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);
|
||||
|
||||
@@ -687,3 +687,10 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin
|
||||
binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
|
||||
ret <16 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
|
||||
@@ -559,3 +559,8 @@ gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
@@ -511,3 +511,5 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
|
||||
ret <4 x double> %call
|
||||
}
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
@@ -992,3 +992,8 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
|
||||
|
||||
define_avgs()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
@@ -191,9 +191,14 @@ declare <WIDTH x double> @__max_varying_double(<WIDTH x double>,
|
||||
|
||||
declare float @__rsqrt_uniform_float(float) nounwind readnone
|
||||
declare float @__rcp_uniform_float(float) nounwind readnone
|
||||
declare double @__rsqrt_uniform_double(double) nounwind readnone
|
||||
declare double @__rcp_uniform_double(double) nounwind readnone
|
||||
declare float @__sqrt_uniform_float(float) nounwind readnone
|
||||
declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>) nounwind readnone
|
||||
declare <WIDTH x double> @__rsqrt_varying_double(<WIDTH x double>) nounwind readnone
|
||||
|
||||
declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
|
||||
|
||||
declare double @__sqrt_uniform_double(double) nounwind readnone
|
||||
|
||||
@@ -515,3 +515,9 @@ define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone {
|
||||
%r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
|
||||
ret <8 x i16> %r
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
@@ -485,3 +485,9 @@ define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone {
|
||||
%r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
|
||||
ret <4 x i16> %r
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
@@ -581,3 +581,9 @@ define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone {
|
||||
v8tov16(i16, %r0, %r1, %r)
|
||||
ret <16 x i16> %r
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
@@ -652,3 +652,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
|
||||
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
@@ -587,3 +587,9 @@ gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
@@ -488,3 +488,9 @@ define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) {
|
||||
define_avg_up_int8()
|
||||
define_avg_up_int16()
|
||||
define_down_avgs()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
@@ -490,3 +490,9 @@ define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
|
||||
define_avg_up_int8()
|
||||
define_avg_up_int16()
|
||||
define_down_avgs()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
@@ -592,3 +592,8 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
|
||||
|
||||
define_avgs()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
@@ -515,3 +515,8 @@ gen_scatter(double)
|
||||
|
||||
define_avgs()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; reciprocals in double precision, if supported
|
||||
|
||||
rsqrtd_decl()
|
||||
rcpd_decl()
|
||||
|
||||
@@ -4531,3 +4531,13 @@ define(`define_avgs', `
|
||||
define_up_avgs()
|
||||
define_down_avgs()
|
||||
')
|
||||
|
||||
define(`rsqrtd_decl', `
|
||||
declare double @__rsqrt_uniform_double(double)
|
||||
declare <WIDTH x double> @__rsqrt_varying_double(<WIDTH x double>)
|
||||
')
|
||||
|
||||
define(`rcpd_decl', `
|
||||
declare double @__rcp_uniform_double(double)
|
||||
declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>)
|
||||
')
|
||||
|
||||
@@ -1086,10 +1086,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) {
|
||||
return 1.f / sqrtf(v);
|
||||
}
|
||||
|
||||
static FORCEINLINE double __rsqrt_uniform_double(double v) {
|
||||
return 1.0 / sqrt(v);
|
||||
}
|
||||
|
||||
static FORCEINLINE float __rcp_uniform_float(float v) {
|
||||
return 1.f / v;
|
||||
}
|
||||
|
||||
static FORCEINLINE double __rcp_uniform_double(double v) {
|
||||
return 1.0 / v;
|
||||
}
|
||||
|
||||
static FORCEINLINE float __sqrt_uniform_float(float v) {
|
||||
return sqrtf(v);
|
||||
}
|
||||
@@ -1099,7 +1107,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) {
|
||||
}
|
||||
|
||||
UNARY_OP(__vec16_f, __rcp_varying_float, __rcp_uniform_float)
|
||||
UNARY_OP(__vec16_d, __rcp_varying_double, __rcp_uniform_double)
|
||||
UNARY_OP(__vec16_f, __rsqrt_varying_float, __rsqrt_uniform_float)
|
||||
UNARY_OP(__vec16_d, __rsqrt_varying_double, __rsqrt_uniform_double)
|
||||
UNARY_OP(__vec16_f, __sqrt_varying_float, __sqrt_uniform_float)
|
||||
UNARY_OP(__vec16_d, __sqrt_varying_double, __sqrt_uniform_double)
|
||||
|
||||
|
||||
@@ -1138,10 +1138,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) {
|
||||
return 1.f / sqrtf(v);
|
||||
}
|
||||
|
||||
static FORCEINLINE double __rsqrt_uniform_double(double v) {
|
||||
return 1.0 / sqrt(v);
|
||||
}
|
||||
|
||||
static FORCEINLINE float __rcp_uniform_float(float v) {
|
||||
return 1.f / v;
|
||||
}
|
||||
|
||||
static FORCEINLINE double __rcp_uniform_double(double v) {
|
||||
return 1.0 / v;
|
||||
}
|
||||
|
||||
static FORCEINLINE float __sqrt_uniform_float(float v) {
|
||||
return sqrtf(v);
|
||||
}
|
||||
@@ -1151,7 +1159,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) {
|
||||
}
|
||||
|
||||
UNARY_OP(__vec32_f, __rcp_varying_float, __rcp_uniform_float)
|
||||
UNARY_OP(__vec32_d, __rcp_varying_double, __rcp_uniform_double)
|
||||
UNARY_OP(__vec32_f, __rsqrt_varying_float, __rsqrt_uniform_float)
|
||||
UNARY_OP(__vec32_d, __rsqrt_varying_double, __rsqrt_uniform_double)
|
||||
UNARY_OP(__vec32_f, __sqrt_varying_float, __sqrt_uniform_float)
|
||||
UNARY_OP(__vec32_d, __sqrt_varying_double, __sqrt_uniform_double)
|
||||
|
||||
|
||||
@@ -1271,10 +1271,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) {
|
||||
return 1.f / sqrtf(v);
|
||||
}
|
||||
|
||||
static FORCEINLINE double __rsqrt_uniform_double(double v) {
|
||||
return 1.0 / sqrt(v);
|
||||
}
|
||||
|
||||
static FORCEINLINE float __rcp_uniform_float(float v) {
|
||||
return 1.f / v;
|
||||
}
|
||||
|
||||
static FORCEINLINE double __rcp_uniform_double(double v) {
|
||||
return 1.0 / v;
|
||||
}
|
||||
|
||||
static FORCEINLINE float __sqrt_uniform_float(float v) {
|
||||
return sqrtf(v);
|
||||
}
|
||||
@@ -1284,7 +1292,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) {
|
||||
}
|
||||
|
||||
UNARY_OP(__vec64_f, __rcp_varying_float, __rcp_uniform_float)
|
||||
UNARY_OP(__vec64_d, __rcp_varying_double, __rcp_uniform_double)
|
||||
UNARY_OP(__vec64_f, __rsqrt_varying_float, __rsqrt_uniform_float)
|
||||
UNARY_OP(__vec64_d, __rsqrt_varying_double, __rsqrt_uniform_double)
|
||||
UNARY_OP(__vec64_f, __sqrt_varying_float, __sqrt_uniform_float)
|
||||
UNARY_OP(__vec64_d, __sqrt_varying_double, __sqrt_uniform_double)
|
||||
|
||||
|
||||
@@ -1811,6 +1811,20 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v)
|
||||
return _mm512_recip_ps(v);
|
||||
#endif
|
||||
}
|
||||
static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) {
|
||||
__vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000));
|
||||
__vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex));
|
||||
__vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp));
|
||||
__vec16_f yf = __rcp_varying_float(xf);
|
||||
__vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exp);
|
||||
y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y))));
|
||||
y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y))));
|
||||
return y;
|
||||
}
|
||||
static FORCEINLINE double __rcp_uniform_double(double v)
|
||||
{
|
||||
return __extract_element(__rcp_varying_double(__smear_double<__vec16_d>(v)),0);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v)
|
||||
{
|
||||
@@ -1820,6 +1834,23 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v)
|
||||
return _mm512_invsqrt_ps(v);
|
||||
#endif
|
||||
}
|
||||
static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) {
|
||||
__vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000));
|
||||
__vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex));
|
||||
__vec16_d exph = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x5fe0000000000000), __lshr(ex,1)));
|
||||
__vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp));
|
||||
__vec16_f yf = __rsqrt_varying_float(xf);
|
||||
__vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exph);
|
||||
__vec16_d xh = __mul(x, __smear_double<__vec16_d>(0.5));
|
||||
y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y)))));
|
||||
y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y)))));
|
||||
return y;
|
||||
}
|
||||
static FORCEINLINE double __rsqrt_uniform_double(double v)
|
||||
{
|
||||
return __extract_element(__rsqrt_varying_double(__smear_double<__vec16_d>(v)),0);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);}
|
||||
static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));}
|
||||
|
||||
|
||||
@@ -1836,6 +1836,20 @@ static FORCEINLINE __vec8_f __rcp_varying_float(__vec8_f v) {
|
||||
return _mm512_mask_recip_ps(FZERO, 0xFF, v);
|
||||
#endif
|
||||
}
|
||||
static FORCEINLINE __vec8_d __rcp_varying_double(__vec8_d x) {
|
||||
__vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000));
|
||||
__vec8_d exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex));
|
||||
__vec8_f xf = __cast_fptrunc(__vec8_f(), __mul(x, exp));
|
||||
__vec8_f yf = __rcp_varying_float(xf);
|
||||
__vec8_d y = __mul(__cast_fpext(__vec8_d(), yf), exp);
|
||||
y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(2.0), __mul(x, y))));
|
||||
y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(2.0), __mul(x, y))));
|
||||
return y;
|
||||
}
|
||||
static FORCEINLINE double __rcp_uniform_double(double v)
|
||||
{
|
||||
return __extract_element(__rcp_varying_double(__smear_double<__vec8_d>(v)),0);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) {
|
||||
#ifdef ISPC_FAST_MATH
|
||||
@@ -1844,6 +1858,22 @@ static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) {
|
||||
return _mm512_mask_invsqrt_ps(FZERO,0xFF,v);
|
||||
#endif
|
||||
}
|
||||
static FORCEINLINE __vec8_d __rsqrt_varying_double(__vec8_d x) {
|
||||
__vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000));
|
||||
__vec8_d exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex));
|
||||
__vec8_d exph = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x5fe0000000000000), __lshr(ex,1)));
|
||||
__vec8_f xf = __cast_fptrunc(__vec8_f(), __mul(x, exp));
|
||||
__vec8_f yf = __rsqrt_varying_float(xf);
|
||||
__vec8_d y = __mul(__cast_fpext(__vec8_d(), yf), exph);
|
||||
__vec8_d xh = __mul(x, __smear_double<__vec8_d>(0.5));
|
||||
y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y,y)))));
|
||||
y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y,y)))));
|
||||
return y;
|
||||
}
|
||||
static FORCEINLINE double __rsqrt_uniform_double(double v)
|
||||
{
|
||||
return __extract_element(__rsqrt_varying_double(__smear_double<__vec8_d>(v)),0);
|
||||
}
|
||||
static FORCEINLINE __vec8_f __sqrt_varying_float (__vec8_f v) { return _mm512_mask_sqrt_ps(FZERO,0xFF,v);}
|
||||
#endif
|
||||
|
||||
@@ -2496,8 +2526,8 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val,
|
||||
_mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||
return _mm_countbits_32(uint32_t(0xFF & mask));
|
||||
}
|
||||
static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val,
|
||||
__vec4_i1 mask) {
|
||||
static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec8_i32 val,
|
||||
__vec8_i1 mask) {
|
||||
return __packed_store_active(ptr, val, mask);
|
||||
}
|
||||
static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val,
|
||||
@@ -2508,8 +2538,8 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val,
|
||||
__vec8_i1 mask) {
|
||||
return __packed_store_active((uint32_t *)p, val, mask);
|
||||
}
|
||||
static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val,
|
||||
__vec4_i1 mask) {
|
||||
static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec8_i32 val,
|
||||
__vec8_i1 mask) {
|
||||
return __packed_store_active(ptr, val, mask);
|
||||
}
|
||||
|
||||
|
||||
@@ -1472,6 +1472,17 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) {
|
||||
return _mm512_recip_ps(v);
|
||||
#endif
|
||||
}
|
||||
static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) {
|
||||
__vec16_d y;
|
||||
for (int i = 0; i < 16; i++)
|
||||
__insert_element(&y, i, 1.0/__extract_element(x,i));
|
||||
return y;
|
||||
}
|
||||
static FORCEINLINE double __rcp_uniform_double(double v)
|
||||
{
|
||||
return 1.0/v;
|
||||
}
|
||||
|
||||
|
||||
static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
|
||||
#ifdef ISPC_FAST_MATH
|
||||
@@ -1480,6 +1491,17 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
|
||||
return _mm512_invsqrt_ps(v);
|
||||
#endif
|
||||
}
|
||||
static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) {
|
||||
__vec16_d y;
|
||||
for (int i = 0; i < 16; i++)
|
||||
__insert_element(&y, i, 1.0/sqrt(__extract_element(x,i)));
|
||||
return y;
|
||||
}
|
||||
static FORCEINLINE double __rsqrt_uniform_double(double v)
|
||||
{
|
||||
return 1.0/v;
|
||||
}
|
||||
|
||||
|
||||
static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) {
|
||||
return _mm512_exp_ps(v);
|
||||
|
||||
@@ -2420,6 +2420,21 @@ static FORCEINLINE __vec4_f __rcp_varying_float(__vec4_f v) {
|
||||
return r;
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_d __rcp_varying_double(__vec4_d x) {
|
||||
__vec4_i64 ex = __and(__cast_bits(__vec4_i64(), x), __smear_i64<__vec4_i64>(0x7fe0000000000000));
|
||||
__vec4_d exp = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x7fd0000000000000), ex));
|
||||
__vec4_f xf = __cast_fptrunc(__vec4_f(), __mul(x, exp));
|
||||
__vec4_f yf = __rcp_varying_float(xf);
|
||||
__vec4_d y = __mul(__cast_fpext(__vec4_d(), yf), exp);
|
||||
y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(2.0), __mul(x, y))));
|
||||
y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(2.0), __mul(x, y))));
|
||||
return y;
|
||||
}
|
||||
static FORCEINLINE double __rcp_uniform_double(double v)
|
||||
{
|
||||
return __extract_element(__rcp_varying_double(__smear_double<__vec4_d>(v)),0);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_f __rsqrt_varying_float(__vec4_f v) {
|
||||
__m128 rsqrt = _mm_rsqrt_ps(v.v);
|
||||
// Newton-Raphson iteration to improve precision
|
||||
@@ -2431,6 +2446,22 @@ static FORCEINLINE __vec4_f __rsqrt_varying_float(__vec4_f v) {
|
||||
__m128 half_scale = _mm_mul_ps(_mm_set1_ps(0.5), rs_mul);
|
||||
return half_scale;
|
||||
}
|
||||
static FORCEINLINE __vec4_d __rsqrt_varying_double(__vec4_d x) {
|
||||
__vec4_i64 ex = __and(__cast_bits(__vec4_i64(), x), __smear_i64<__vec4_i64>(0x7fe0000000000000));
|
||||
__vec4_d exp = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x7fd0000000000000), ex));
|
||||
__vec4_d exph = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x5fe0000000000000), __lshr(ex,1)));
|
||||
__vec4_f xf = __cast_fptrunc(__vec4_f(), __mul(x, exp));
|
||||
__vec4_f yf = __rsqrt_varying_float(xf);
|
||||
__vec4_d y = __mul(__cast_fpext(__vec4_d(), yf), exph);
|
||||
__vec4_d xh = __mul(x, __smear_double<__vec4_d>(0.5));
|
||||
y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(0.5), __mul(xh, __mul(y,y)))));
|
||||
y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(0.5), __mul(xh, __mul(y,y)))));
|
||||
return y;
|
||||
}
|
||||
static FORCEINLINE double __rsqrt_uniform_double(double v)
|
||||
{
|
||||
return __extract_element(__rsqrt_varying_double(__smear_double<__vec4_d>(v)),0);
|
||||
}
|
||||
|
||||
static FORCEINLINE __vec4_f __sqrt_varying_float(__vec4_f v) {
|
||||
return _mm_sqrt_ps(v.v);
|
||||
|
||||
9
ispc.cpp
9
ispc.cpp
@@ -201,7 +201,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
m_hasRand(false),
|
||||
m_hasGather(false),
|
||||
m_hasScatter(false),
|
||||
m_hasTranscendentals(false)
|
||||
m_hasTranscendentals(false),
|
||||
m_hasRsqrtd(false),
|
||||
m_hasRcpd(false)
|
||||
{
|
||||
if (isa == NULL) {
|
||||
if (cpu != NULL) {
|
||||
@@ -419,6 +421,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
this->m_hasHalf = true;
|
||||
this->m_hasTranscendentals = true;
|
||||
this->m_hasGather = this->m_hasScatter = true;
|
||||
this->m_hasRsqrtd = this->m_hasRcpd = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-8") ||
|
||||
!strcasecmp(isa, "generic-x8")) {
|
||||
@@ -431,6 +434,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
this->m_hasHalf = true;
|
||||
this->m_hasTranscendentals = true;
|
||||
this->m_hasGather = this->m_hasScatter = true;
|
||||
this->m_hasRsqrtd = this->m_hasRcpd = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-16") ||
|
||||
!strcasecmp(isa, "generic-x16")) {
|
||||
@@ -443,6 +447,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
this->m_hasHalf = true;
|
||||
this->m_hasTranscendentals = true;
|
||||
this->m_hasGather = this->m_hasScatter = true;
|
||||
this->m_hasRsqrtd = this->m_hasRcpd = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-32") ||
|
||||
!strcasecmp(isa, "generic-x32")) {
|
||||
@@ -455,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
this->m_hasHalf = true;
|
||||
this->m_hasTranscendentals = true;
|
||||
this->m_hasGather = this->m_hasScatter = true;
|
||||
this->m_hasRsqrtd = this->m_hasRcpd = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-64") ||
|
||||
!strcasecmp(isa, "generic-x64")) {
|
||||
@@ -467,6 +473,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
this->m_hasHalf = true;
|
||||
this->m_hasTranscendentals = true;
|
||||
this->m_hasGather = this->m_hasScatter = true;
|
||||
this->m_hasRsqrtd = this->m_hasRcpd = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-1") ||
|
||||
!strcasecmp(isa, "generic-x1")) {
|
||||
|
||||
10
ispc.h
10
ispc.h
@@ -281,6 +281,10 @@ public:
|
||||
bool hasScatter() const {return m_hasScatter;}
|
||||
|
||||
bool hasTranscendentals() const {return m_hasTranscendentals;}
|
||||
|
||||
bool hasRsqrtd() const {return m_hasRsqrtd;}
|
||||
|
||||
bool hasRcpd() const {return m_hasRcpd;}
|
||||
|
||||
private:
|
||||
|
||||
@@ -380,6 +384,12 @@ private:
|
||||
/** Indicates whether the target has support for transcendentals (beyond
|
||||
sqrt, which we assume that all of them handle). */
|
||||
bool m_hasTranscendentals;
|
||||
|
||||
/** Indicates whether there is an ISA double precision rsqrt. */
|
||||
bool m_hasRsqrtd;
|
||||
|
||||
/** Indicates whether there is an ISA double precision rcp. */
|
||||
bool m_hasRcpd;
|
||||
};
|
||||
|
||||
|
||||
|
||||
75
stdlib.ispc
75
stdlib.ispc
@@ -1391,6 +1391,43 @@ static inline uniform float rcp(uniform float v) {
|
||||
return __rcp_uniform_float(v);
|
||||
}
|
||||
|
||||
#define RCPD(QUAL) \
|
||||
__declspec(safe) \
|
||||
static inline QUAL double __rcp_iterate_##QUAL##_double(QUAL double v, QUAL double iv) \
|
||||
{ \
|
||||
iv = iv * (2.0d - v*iv); \
|
||||
iv = iv * (2.0d - v*iv); \
|
||||
return iv; \
|
||||
} \
|
||||
__declspec(safe) \
|
||||
static inline QUAL double __rcp_safe_##QUAL##_double(QUAL double x) \
|
||||
{ \
|
||||
if (x <= 1.0d+33 && x >= 1.0d-33) \
|
||||
return __rcp_iterate_##QUAL##_double(x, rcp((QUAL float)x)); \
|
||||
QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \
|
||||
QUAL double exp = doublebits( 0x7fd0000000000000 + ~ex ); \
|
||||
QUAL double y = rcp((QUAL float)(x*exp)); \
|
||||
return __rcp_iterate_##QUAL##_double(x, y*exp); \
|
||||
}
|
||||
|
||||
RCPD(varying)
|
||||
__declspec(safe)
|
||||
static inline double rcp(double v) {
|
||||
if (__have_native_rcpd)
|
||||
return __rcp_varying_double(v);
|
||||
else
|
||||
return __rcp_safe_varying_double(v);
|
||||
}
|
||||
|
||||
RCPD(uniform)
|
||||
__declspec(safe)
|
||||
static inline uniform double rcp(uniform double v) {
|
||||
if (__have_native_rcpd)
|
||||
return __rcp_uniform_double(v);
|
||||
else
|
||||
return __rcp_safe_uniform_double(v);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// min/max
|
||||
|
||||
@@ -3517,6 +3554,44 @@ static inline uniform double sqrt(uniform double v) {
|
||||
return __sqrt_uniform_double(v);
|
||||
}
|
||||
|
||||
#define RSQRTD(QUAL) \
|
||||
__declspec(safe) \
|
||||
static inline QUAL double __rsqrt_iterate_##QUAL##_double(QUAL double x, QUAL double y) \
|
||||
{ \
|
||||
QUAL double xh = x*0.5d; \
|
||||
y += y*(0.5d0 - xh*y*y); \
|
||||
y += y*(0.5d0 - xh*y*y); \
|
||||
return y; \
|
||||
} \
|
||||
__declspec(safe) \
|
||||
static inline QUAL double __rsqrt_safe_##QUAL##_double (QUAL double x) \
|
||||
{ \
|
||||
if (x <= 1.0d+33 && x >= 1.0d-33) \
|
||||
return __rsqrt_iterate_##QUAL##_double(x, rsqrt((QUAL float)x)); \
|
||||
QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \
|
||||
QUAL double exp = doublebits( 0x7fd0000000000000 - ex ); /* 1.0d/exponent */ \
|
||||
QUAL double exph = doublebits( 0x5fe0000000000000 - (ex >> 1)); /* 1.0d/sqrt(exponent) */ \
|
||||
QUAL double y = rsqrt((QUAL float)(x*exp)); \
|
||||
return __rsqrt_iterate_##QUAL##_double(x, y*exph); \
|
||||
}
|
||||
|
||||
RSQRTD(varying)
|
||||
__declspec(safe)
|
||||
static inline double rsqrt(double v) {
|
||||
if (__have_native_rsqrtd)
|
||||
return __rsqrt_varying_double(v);
|
||||
else
|
||||
return __rsqrt_safe_varying_double(v);
|
||||
}
|
||||
|
||||
RSQRTD(uniform)
|
||||
__declspec(safe)
|
||||
static inline uniform double rsqrt(uniform double v) {
|
||||
if (__have_native_rsqrtd)
|
||||
return __rsqrt_uniform_double(v);
|
||||
else
|
||||
return __rsqrt_safe_uniform_double(v);
|
||||
}
|
||||
__declspec(safe)
|
||||
static inline double ldexp(double x, int n) {
|
||||
unsigned int64 ex = 0x7ff0000000000000;
|
||||
|
||||
16
tests/test-147.ispc
Normal file
16
tests/test-147.ispc
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
double x = aFOO[programIndex&0x3]*1d100;
|
||||
double d, ix;
|
||||
ix = rcp(x);
|
||||
d = (ix - 1.0d0 / x);
|
||||
d = (d < 0.0d0) ? -d : d;
|
||||
RET[programIndex] = (d < 1d-15 && !isnan(d)) ? 1.0d0 : 0.0d0;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 1.0d0;
|
||||
}
|
||||
16
tests/test-148.ispc
Normal file
16
tests/test-148.ispc
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
double x = aFOO[programIndex]*1d100;
|
||||
double d, invsqrt = rsqrt(x);
|
||||
d = (x * (invsqrt * invsqrt)) - 1.0d0;
|
||||
if (d < 0.0d0) d = -d;
|
||||
RET[programIndex] = d > 1d-15;
|
||||
}
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 0;
|
||||
}
|
||||
Reference in New Issue
Block a user