Merge pull request #730 from egaburov/double_math

Added double precision support for reciprocals: rsqrt rcp
This commit is contained in:
Dmitry Babokin
2014-02-05 17:57:39 +03:00
28 changed files with 367 additions and 5 deletions

View File

@@ -497,6 +497,8 @@ lSetInternalFunctions(llvm::Module *module) {
"__prefetch_read_uniform_nt",
"__rcp_uniform_float",
"__rcp_varying_float",
"__rcp_uniform_double",
"__rcp_varying_double",
"__rdrand_i16",
"__rdrand_i32",
"__rdrand_i64",
@@ -534,6 +536,8 @@ lSetInternalFunctions(llvm::Module *module) {
"__round_varying_float",
"__rsqrt_uniform_float",
"__rsqrt_varying_float",
"__rsqrt_uniform_double",
"__rsqrt_varying_double",
"__set_system_isa",
"__sext_uniform_bool",
"__sext_varying_bool",
@@ -1146,6 +1150,10 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
symbolTable);
lDefineConstantInt("__have_native_transcendentals", g->target->hasTranscendentals(),
module, symbolTable);
lDefineConstantInt("__have_native_rsqrtd", g->target->hasRsqrtd(),
module, symbolTable);
lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(),
module, symbolTable);
if (g->forceAlignment != -1) {
llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);

View File

@@ -687,3 +687,10 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin
binary4to16(ret, double, @llvm.x86.avx.max.pd.256, %0, %1)
ret <16 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported
rsqrtd_decl()
rcpd_decl()

View File

@@ -559,3 +559,8 @@ gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported
rsqrtd_decl()
rcpd_decl()

View File

@@ -511,3 +511,5 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
ret <4 x double> %call
}
rsqrtd_decl()
rcpd_decl()

View File

@@ -992,3 +992,8 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
define_avgs()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported
rsqrtd_decl()
rcpd_decl()

View File

@@ -191,9 +191,14 @@ declare <WIDTH x double> @__max_varying_double(<WIDTH x double>,
declare float @__rsqrt_uniform_float(float) nounwind readnone
declare float @__rcp_uniform_float(float) nounwind readnone
declare double @__rsqrt_uniform_double(double) nounwind readnone
declare double @__rcp_uniform_double(double) nounwind readnone
declare float @__sqrt_uniform_float(float) nounwind readnone
declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone
declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x double> @__rsqrt_varying_double(<WIDTH x double>) nounwind readnone
declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
declare double @__sqrt_uniform_double(double) nounwind readnone

View File

@@ -515,3 +515,9 @@ define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported
rsqrtd_decl()
rcpd_decl()

View File

@@ -485,3 +485,9 @@ define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported
rsqrtd_decl()
rcpd_decl()

View File

@@ -581,3 +581,9 @@ define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone {
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported
rsqrtd_decl()
rcpd_decl()

View File

@@ -652,3 +652,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <8 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported
rsqrtd_decl()
rcpd_decl()

View File

@@ -587,3 +587,9 @@ gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported
rsqrtd_decl()
rcpd_decl()

View File

@@ -488,3 +488,9 @@ define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) {
define_avg_up_int8()
define_avg_up_int16()
define_down_avgs()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported
rsqrtd_decl()
rcpd_decl()

View File

@@ -490,3 +490,9 @@ define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
define_avg_up_int8()
define_avg_up_int16()
define_down_avgs()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported
rsqrtd_decl()
rcpd_decl()

View File

@@ -592,3 +592,8 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
define_avgs()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported
rsqrtd_decl()
rcpd_decl()

View File

@@ -515,3 +515,8 @@ gen_scatter(double)
define_avgs()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; reciprocals in double precision, if supported
rsqrtd_decl()
rcpd_decl()

View File

@@ -4531,3 +4531,13 @@ define(`define_avgs', `
define_up_avgs()
define_down_avgs()
')
define(`rsqrtd_decl', `
declare double @__rsqrt_uniform_double(double)
declare <WIDTH x double> @__rsqrt_varying_double(<WIDTH x double>)
')
define(`rcpd_decl', `
declare double @__rcp_uniform_double(double)
declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>)
')

View File

@@ -1086,10 +1086,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) {
return 1.f / sqrtf(v);
}
static FORCEINLINE double __rsqrt_uniform_double(double v) {
return 1.0 / sqrt(v);
}
static FORCEINLINE float __rcp_uniform_float(float v) {
return 1.f / v;
}
static FORCEINLINE double __rcp_uniform_double(double v) {
return 1.0 / v;
}
static FORCEINLINE float __sqrt_uniform_float(float v) {
return sqrtf(v);
}
@@ -1099,7 +1107,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) {
}
UNARY_OP(__vec16_f, __rcp_varying_float, __rcp_uniform_float)
UNARY_OP(__vec16_d, __rcp_varying_double, __rcp_uniform_double)
UNARY_OP(__vec16_f, __rsqrt_varying_float, __rsqrt_uniform_float)
UNARY_OP(__vec16_d, __rsqrt_varying_double, __rsqrt_uniform_double)
UNARY_OP(__vec16_f, __sqrt_varying_float, __sqrt_uniform_float)
UNARY_OP(__vec16_d, __sqrt_varying_double, __sqrt_uniform_double)

View File

@@ -1138,10 +1138,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) {
return 1.f / sqrtf(v);
}
static FORCEINLINE double __rsqrt_uniform_double(double v) {
return 1.0 / sqrt(v);
}
static FORCEINLINE float __rcp_uniform_float(float v) {
return 1.f / v;
}
static FORCEINLINE double __rcp_uniform_double(double v) {
return 1.0 / v;
}
static FORCEINLINE float __sqrt_uniform_float(float v) {
return sqrtf(v);
}
@@ -1151,7 +1159,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) {
}
UNARY_OP(__vec32_f, __rcp_varying_float, __rcp_uniform_float)
UNARY_OP(__vec32_d, __rcp_varying_double, __rcp_uniform_double)
UNARY_OP(__vec32_f, __rsqrt_varying_float, __rsqrt_uniform_float)
UNARY_OP(__vec32_d, __rsqrt_varying_double, __rsqrt_uniform_double)
UNARY_OP(__vec32_f, __sqrt_varying_float, __sqrt_uniform_float)
UNARY_OP(__vec32_d, __sqrt_varying_double, __sqrt_uniform_double)

View File

@@ -1271,10 +1271,18 @@ static FORCEINLINE float __rsqrt_uniform_float(float v) {
return 1.f / sqrtf(v);
}
static FORCEINLINE double __rsqrt_uniform_double(double v) {
return 1.0 / sqrt(v);
}
static FORCEINLINE float __rcp_uniform_float(float v) {
return 1.f / v;
}
static FORCEINLINE double __rcp_uniform_double(double v) {
return 1.0 / v;
}
static FORCEINLINE float __sqrt_uniform_float(float v) {
return sqrtf(v);
}
@@ -1284,7 +1292,9 @@ static FORCEINLINE double __sqrt_uniform_double(double v) {
}
UNARY_OP(__vec64_f, __rcp_varying_float, __rcp_uniform_float)
UNARY_OP(__vec64_d, __rcp_varying_double, __rcp_uniform_double)
UNARY_OP(__vec64_f, __rsqrt_varying_float, __rsqrt_uniform_float)
UNARY_OP(__vec64_d, __rsqrt_varying_double, __rsqrt_uniform_double)
UNARY_OP(__vec64_f, __sqrt_varying_float, __sqrt_uniform_float)
UNARY_OP(__vec64_d, __sqrt_varying_double, __sqrt_uniform_double)

View File

@@ -1811,6 +1811,20 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v)
return _mm512_recip_ps(v);
#endif
}
static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) {
__vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000));
__vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex));
__vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp));
__vec16_f yf = __rcp_varying_float(xf);
__vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exp);
y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y))));
y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(2.0), __mul(x, y))));
return y;
}
static FORCEINLINE double __rcp_uniform_double(double v)
{
return __extract_element(__rcp_varying_double(__smear_double<__vec16_d>(v)),0);
}
static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v)
{
@@ -1820,6 +1834,23 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v)
return _mm512_invsqrt_ps(v);
#endif
}
static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) {
__vec16_i64 ex = __and(__cast_bits(__vec16_i64(), x), __smear_i64<__vec16_i64>(0x7fe0000000000000));
__vec16_d exp = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x7fd0000000000000), ex));
__vec16_d exph = __cast_bits(__vec16_d(), __sub(__smear_i64<__vec16_i64>(0x5fe0000000000000), __lshr(ex,1)));
__vec16_f xf = __cast_fptrunc(__vec16_f(), __mul(x, exp));
__vec16_f yf = __rsqrt_varying_float(xf);
__vec16_d y = __mul(__cast_fpext(__vec16_d(), yf), exph);
__vec16_d xh = __mul(x, __smear_double<__vec16_d>(0.5));
y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y)))));
y = __add(y, __mul(y, __sub(__smear_double<__vec16_d>(0.5), __mul(xh, __mul(y,y)))));
return y;
}
static FORCEINLINE double __rsqrt_uniform_double(double v)
{
return __extract_element(__rsqrt_varying_double(__smear_double<__vec16_d>(v)),0);
}
static FORCEINLINE __vec16_f __sqrt_varying_float (__vec16_f v) { return _mm512_sqrt_ps(v);}
static FORCEINLINE __vec16_d __sqrt_varying_double(__vec16_d v) { return __vec16_d(_mm512_sqrt_pd(v.v1),_mm512_sqrt_pd(v.v2));}

View File

@@ -1836,6 +1836,20 @@ static FORCEINLINE __vec8_f __rcp_varying_float(__vec8_f v) {
return _mm512_mask_recip_ps(FZERO, 0xFF, v);
#endif
}
static FORCEINLINE __vec8_d __rcp_varying_double(__vec8_d x) {
__vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000));
__vec8_d exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex));
__vec8_f xf = __cast_fptrunc(__vec8_f(), __mul(x, exp));
__vec8_f yf = __rcp_varying_float(xf);
__vec8_d y = __mul(__cast_fpext(__vec8_d(), yf), exp);
y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(2.0), __mul(x, y))));
y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(2.0), __mul(x, y))));
return y;
}
static FORCEINLINE double __rcp_uniform_double(double v)
{
return __extract_element(__rcp_varying_double(__smear_double<__vec8_d>(v)),0);
}
static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) {
#ifdef ISPC_FAST_MATH
@@ -1844,6 +1858,22 @@ static FORCEINLINE __vec8_f __rsqrt_varying_float(__vec8_f v) {
return _mm512_mask_invsqrt_ps(FZERO,0xFF,v);
#endif
}
static FORCEINLINE __vec8_d __rsqrt_varying_double(__vec8_d x) {
__vec8_i64 ex = __and(__cast_bits(__vec8_i64(), x), __smear_i64<__vec8_i64>(0x7fe0000000000000));
__vec8_d exp = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x7fd0000000000000), ex));
__vec8_d exph = __cast_bits(__vec8_d(), __sub(__smear_i64<__vec8_i64>(0x5fe0000000000000), __lshr(ex,1)));
__vec8_f xf = __cast_fptrunc(__vec8_f(), __mul(x, exp));
__vec8_f yf = __rsqrt_varying_float(xf);
__vec8_d y = __mul(__cast_fpext(__vec8_d(), yf), exph);
__vec8_d xh = __mul(x, __smear_double<__vec8_d>(0.5));
y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y,y)))));
y = __add(y, __mul(y, __sub(__smear_double<__vec8_d>(0.5), __mul(xh, __mul(y,y)))));
return y;
}
static FORCEINLINE double __rsqrt_uniform_double(double v)
{
return __extract_element(__rsqrt_varying_double(__smear_double<__vec8_d>(v)),0);
}
static FORCEINLINE __vec8_f __sqrt_varying_float (__vec8_f v) { return _mm512_mask_sqrt_ps(FZERO,0xFF,v);}
#endif
@@ -2496,8 +2526,8 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val,
_mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
return _mm_countbits_32(uint32_t(0xFF & mask));
}
static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val,
__vec4_i1 mask) {
static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec8_i32 val,
__vec8_i1 mask) {
return __packed_store_active(ptr, val, mask);
}
static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val,
@@ -2508,8 +2538,8 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val,
__vec8_i1 mask) {
return __packed_store_active((uint32_t *)p, val, mask);
}
static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val,
__vec4_i1 mask) {
static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec8_i32 val,
__vec8_i1 mask) {
return __packed_store_active(ptr, val, mask);
}

View File

@@ -1472,6 +1472,17 @@ static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) {
return _mm512_recip_ps(v);
#endif
}
static FORCEINLINE __vec16_d __rcp_varying_double(__vec16_d x) {
__vec16_d y;
for (int i = 0; i < 16; i++)
__insert_element(&y, i, 1.0/__extract_element(x,i));
return y;
}
static FORCEINLINE double __rcp_uniform_double(double v)
{
return 1.0/v;
}
static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
#ifdef ISPC_FAST_MATH
@@ -1480,6 +1491,17 @@ static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
return _mm512_invsqrt_ps(v);
#endif
}
static FORCEINLINE __vec16_d __rsqrt_varying_double(__vec16_d x) {
__vec16_d y;
for (int i = 0; i < 16; i++)
__insert_element(&y, i, 1.0/sqrt(__extract_element(x,i)));
return y;
}
static FORCEINLINE double __rsqrt_uniform_double(double v)
{
return 1.0/v;
}
static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) {
return _mm512_exp_ps(v);

View File

@@ -2420,6 +2420,21 @@ static FORCEINLINE __vec4_f __rcp_varying_float(__vec4_f v) {
return r;
}
static FORCEINLINE __vec4_d __rcp_varying_double(__vec4_d x) {
__vec4_i64 ex = __and(__cast_bits(__vec4_i64(), x), __smear_i64<__vec4_i64>(0x7fe0000000000000));
__vec4_d exp = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x7fd0000000000000), ex));
__vec4_f xf = __cast_fptrunc(__vec4_f(), __mul(x, exp));
__vec4_f yf = __rcp_varying_float(xf);
__vec4_d y = __mul(__cast_fpext(__vec4_d(), yf), exp);
y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(2.0), __mul(x, y))));
y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(2.0), __mul(x, y))));
return y;
}
static FORCEINLINE double __rcp_uniform_double(double v)
{
return __extract_element(__rcp_varying_double(__smear_double<__vec4_d>(v)),0);
}
static FORCEINLINE __vec4_f __rsqrt_varying_float(__vec4_f v) {
__m128 rsqrt = _mm_rsqrt_ps(v.v);
// Newton-Raphson iteration to improve precision
@@ -2431,6 +2446,22 @@ static FORCEINLINE __vec4_f __rsqrt_varying_float(__vec4_f v) {
__m128 half_scale = _mm_mul_ps(_mm_set1_ps(0.5), rs_mul);
return half_scale;
}
static FORCEINLINE __vec4_d __rsqrt_varying_double(__vec4_d x) {
__vec4_i64 ex = __and(__cast_bits(__vec4_i64(), x), __smear_i64<__vec4_i64>(0x7fe0000000000000));
__vec4_d exp = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x7fd0000000000000), ex));
__vec4_d exph = __cast_bits(__vec4_d(), __sub(__smear_i64<__vec4_i64>(0x5fe0000000000000), __lshr(ex,1)));
__vec4_f xf = __cast_fptrunc(__vec4_f(), __mul(x, exp));
__vec4_f yf = __rsqrt_varying_float(xf);
__vec4_d y = __mul(__cast_fpext(__vec4_d(), yf), exph);
__vec4_d xh = __mul(x, __smear_double<__vec4_d>(0.5));
y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(0.5), __mul(xh, __mul(y,y)))));
y = __add(y, __mul(y, __sub(__smear_double<__vec4_d>(0.5), __mul(xh, __mul(y,y)))));
return y;
}
static FORCEINLINE double __rsqrt_uniform_double(double v)
{
return __extract_element(__rsqrt_varying_double(__smear_double<__vec4_d>(v)),0);
}
static FORCEINLINE __vec4_f __sqrt_varying_float(__vec4_f v) {
return _mm_sqrt_ps(v.v);

View File

@@ -201,7 +201,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
m_hasRand(false),
m_hasGather(false),
m_hasScatter(false),
m_hasTranscendentals(false)
m_hasTranscendentals(false),
m_hasRsqrtd(false),
m_hasRcpd(false)
{
if (isa == NULL) {
if (cpu != NULL) {
@@ -419,6 +421,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_hasHalf = true;
this->m_hasTranscendentals = true;
this->m_hasGather = this->m_hasScatter = true;
this->m_hasRsqrtd = this->m_hasRcpd = true;
}
else if (!strcasecmp(isa, "generic-8") ||
!strcasecmp(isa, "generic-x8")) {
@@ -431,6 +434,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_hasHalf = true;
this->m_hasTranscendentals = true;
this->m_hasGather = this->m_hasScatter = true;
this->m_hasRsqrtd = this->m_hasRcpd = true;
}
else if (!strcasecmp(isa, "generic-16") ||
!strcasecmp(isa, "generic-x16")) {
@@ -443,6 +447,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_hasHalf = true;
this->m_hasTranscendentals = true;
this->m_hasGather = this->m_hasScatter = true;
this->m_hasRsqrtd = this->m_hasRcpd = true;
}
else if (!strcasecmp(isa, "generic-32") ||
!strcasecmp(isa, "generic-x32")) {
@@ -455,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_hasHalf = true;
this->m_hasTranscendentals = true;
this->m_hasGather = this->m_hasScatter = true;
this->m_hasRsqrtd = this->m_hasRcpd = true;
}
else if (!strcasecmp(isa, "generic-64") ||
!strcasecmp(isa, "generic-x64")) {
@@ -467,6 +473,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_hasHalf = true;
this->m_hasTranscendentals = true;
this->m_hasGather = this->m_hasScatter = true;
this->m_hasRsqrtd = this->m_hasRcpd = true;
}
else if (!strcasecmp(isa, "generic-1") ||
!strcasecmp(isa, "generic-x1")) {

10
ispc.h
View File

@@ -281,6 +281,10 @@ public:
bool hasScatter() const {return m_hasScatter;}
bool hasTranscendentals() const {return m_hasTranscendentals;}
bool hasRsqrtd() const {return m_hasRsqrtd;}
bool hasRcpd() const {return m_hasRcpd;}
private:
@@ -380,6 +384,12 @@ private:
/** Indicates whether the target has support for transcendentals (beyond
sqrt, which we assume that all of them handle). */
bool m_hasTranscendentals;
/** Indicates whether there is an ISA double precision rsqrt. */
bool m_hasRsqrtd;
/** Indicates whether there is an ISA double precision rcp. */
bool m_hasRcpd;
};

View File

@@ -1391,6 +1391,43 @@ static inline uniform float rcp(uniform float v) {
return __rcp_uniform_float(v);
}
#define RCPD(QUAL) \
__declspec(safe) \
static inline QUAL double __rcp_iterate_##QUAL##_double(QUAL double v, QUAL double iv) \
{ \
iv = iv * (2.0d - v*iv); \
iv = iv * (2.0d - v*iv); \
return iv; \
} \
__declspec(safe) \
static inline QUAL double __rcp_safe_##QUAL##_double(QUAL double x) \
{ \
if (x <= 1.0d+33 && x >= 1.0d-33) \
return __rcp_iterate_##QUAL##_double(x, rcp((QUAL float)x)); \
QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \
QUAL double exp = doublebits( 0x7fd0000000000000 + ~ex ); \
QUAL double y = rcp((QUAL float)(x*exp)); \
return __rcp_iterate_##QUAL##_double(x, y*exp); \
}
RCPD(varying)
__declspec(safe)
static inline double rcp(double v) {
if (__have_native_rcpd)
return __rcp_varying_double(v);
else
return __rcp_safe_varying_double(v);
}
RCPD(uniform)
__declspec(safe)
static inline uniform double rcp(uniform double v) {
if (__have_native_rcpd)
return __rcp_uniform_double(v);
else
return __rcp_safe_uniform_double(v);
}
///////////////////////////////////////////////////////////////////////////
// min/max
@@ -3517,6 +3554,44 @@ static inline uniform double sqrt(uniform double v) {
return __sqrt_uniform_double(v);
}
#define RSQRTD(QUAL) \
__declspec(safe) \
static inline QUAL double __rsqrt_iterate_##QUAL##_double(QUAL double x, QUAL double y) \
{ \
QUAL double xh = x*0.5d; \
y += y*(0.5d0 - xh*y*y); \
y += y*(0.5d0 - xh*y*y); \
return y; \
} \
__declspec(safe) \
static inline QUAL double __rsqrt_safe_##QUAL##_double (QUAL double x) \
{ \
if (x <= 1.0d+33 && x >= 1.0d-33) \
return __rsqrt_iterate_##QUAL##_double(x, rsqrt((QUAL float)x)); \
QUAL int64 ex = intbits(x) & 0x7fe0000000000000; \
QUAL double exp = doublebits( 0x7fd0000000000000 - ex ); /* 1.0d/exponent */ \
QUAL double exph = doublebits( 0x5fe0000000000000 - (ex >> 1)); /* 1.0d/sqrt(exponent) */ \
QUAL double y = rsqrt((QUAL float)(x*exp)); \
return __rsqrt_iterate_##QUAL##_double(x, y*exph); \
}
RSQRTD(varying)
__declspec(safe)
static inline double rsqrt(double v) {
if (__have_native_rsqrtd)
return __rsqrt_varying_double(v);
else
return __rsqrt_safe_varying_double(v);
}
RSQRTD(uniform)
__declspec(safe)
static inline uniform double rsqrt(uniform double v) {
if (__have_native_rsqrtd)
return __rsqrt_uniform_double(v);
else
return __rsqrt_safe_uniform_double(v);
}
__declspec(safe)
static inline double ldexp(double x, int n) {
unsigned int64 ex = 0x7ff0000000000000;

16
tests/test-147.ispc Normal file
View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double x = aFOO[programIndex&0x3]*1d100;
double d, ix;
ix = rcp(x);
d = (ix - 1.0d0 / x);
d = (d < 0.0d0) ? -d : d;
RET[programIndex] = (d < 1d-15 && !isnan(d)) ? 1.0d0 : 0.0d0;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1.0d0;
}

16
tests/test-148.ispc Normal file
View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double x = aFOO[programIndex]*1d100;
double d, invsqrt = rsqrt(x);
d = (x * (invsqrt * invsqrt)) - 1.0d0;
if (d < 0.0d0) d = -d;
RET[programIndex] = d > 1d-15;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}