From 0c1b206185e13e39be330622e352edd7208fcbad Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 3 May 2012 13:46:56 -0700 Subject: [PATCH] Pass log/exp/pow transcendentals through to targets that support them. Currently, these are the generic targets. --- builtins.cpp | 8 ++- builtins/util.m4 | 7 ++ examples/intrinsics/generic-16.h | 115 +++++++++++++++++++++++++++++++ ispc.cpp | 9 +++ ispc.h | 8 +++ module.cpp | 5 ++ stdlib.ispc | 28 ++++++-- 7 files changed, 172 insertions(+), 8 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index b94fa04f..d9432ae9 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -886,10 +886,12 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, symbolTable); - lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module, - symbolTable); + lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, + module, symbolTable); - lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2), + lDefineConstantInt("__have_native_half", g->target.hasHalf, module, + symbolTable); + lDefineConstantInt("__have_native_transcendentals", g->target.hasTranscendentals, module, symbolTable); if (includeStdlibISPC) { diff --git a/builtins/util.m4 b/builtins/util.m4 index 501f2e47..042b2ef5 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1654,6 +1654,13 @@ declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, , declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, , i32, , , ) nounwind +declare float @__log_uniform_float(float) nounwind readnone +declare @__log_varying_float() nounwind readnone +declare float @__exp_uniform_float(float) nounwind readnone +declare @__exp_varying_float() nounwind readnone +declare float @__pow_uniform_float(float, float) nounwind readnone +declare @__pow_varying_float(, ) nounwind readnone + 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector ops diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index d6a5c121..384a9ece 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -586,6 +586,121 @@ ROTATE(__vec16_f, float, float) SHUFFLES(__vec16_f, float, float) LOAD_STORE(__vec16_f, float) +static FORCEINLINE float __exp_uniform_float(float v) { + return expf(v); +} + +static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) { + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret.v[i] = expf(v.v[i]); + return ret; +} + +static FORCEINLINE float __log_uniform_float(float v) { + return logf(v); +} + +static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) { + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret.v[i] = logf(v.v[i]); + return ret; +} + +static FORCEINLINE float __pow_uniform_float(float a, float b) { + return powf(a, b); +} + +static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) { + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret.v[i] = powf(a.v[i], b.v[i]); + return ret; +} + +static FORCEINLINE int __intbits(float v) { + union { + float f; + int i; + } u; + u.f = v; + return u.i; +} + +static FORCEINLINE float __floatbits(int v) { + union { + float f; + int i; + } u; + u.i = v; + return u.f; +} + +static FORCEINLINE float __half_to_float_uniform(int16_t h) { + static const uint32_t shifted_exp = 0x7c00 << 13; // half exponent mask (0x7c00) shifted into float bit positions + + int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits (sign masked off), aligned to the float layout + uint32_t exp = shifted_exp & o; // just the exponent + o += (127 - 15) << 23; // rebias exponent from half (15) to float (127) + + // handle exponent special cases + if (exp == shifted_exp) // Inf/NaN? + o += (128 - 16) << 23; // extra exp adjust: 31 + 112 + 112 = 255, the float Inf/NaN exponent + else if (exp == 0) { // Zero/Denormal? 
+ o += 1 << 23; // extra exp adjust: exponent field becomes 113 + o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize: subtract 2^-14 (bits 113 << 23) to remove the implicit leading 1 + } + + o |= ((int32_t)(h & 0x8000)) << 16; // move the half sign bit (bit 15) up to the float sign bit (bit 31) + return __floatbits(o); +} + + +static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) { + __vec16_f ret; + for (int i = 0; i < 16; ++i) + ret.v[i] = __half_to_float_uniform(v.v[i]); + return ret; +} + + +static FORCEINLINE int16_t __float_to_half_uniform(float f) { + uint32_t sign_mask = 0x80000000u; + int32_t o; + + int32_t fint = __intbits(f); + int32_t sign = fint & sign_mask; + fint ^= sign; + + int32_t f32infty = 255 << 23; + o = (fint > f32infty) ? 0x7e00 : 0x7c00; // NaN -> quiet NaN, otherwise Inf; overwritten below for finite inputs + + // (De)normalized number or zero + // compute fint2 unconditionally; it is only used below when the input is + // finite, so its value in the Inf/NaN case does not matter. + const uint32_t round_mask = ~0xfffu; + const int32_t magic = 15 << 23; + const int32_t f16infty = 31 << 23; + + int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask; + fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to half infinity if overflowed (the sign is OR'd back in at the return) + + if (fint < f32infty) + o = fint2 >> 13; // finite input: shift the result down into the 16-bit half layout 
+ + return (o | (sign >> 16)); +} + + +static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) { + __vec16_i16 ret; + for (int i = 0; i < 16; ++i) + ret.v[i] = __float_to_half_uniform(v.v[i]); + return ret; +} + + /////////////////////////////////////////////////////////////////////////// // double diff --git a/ispc.cpp b/ispc.cpp index 3a2134d1..9d1220d5 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -197,6 +197,9 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->arch = arch; } + // Most targets have neither capability; the targets that do set these flags to true below + t->hasHalf = t->hasTranscendentals = false; + if (!strcasecmp(isa, "sse2")) { t->isa = Target::SSE2; t->nativeVectorWidth = 4; @@ -256,6 +259,8 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskingIsFree = true; t->allOffMaskIsSafe = true; t->maskBitCount = 1; + t->hasHalf = true; + t->hasTranscendentals = true; } else if (!strcasecmp(isa, "generic-32")) { t->isa = Target::GENERIC; @@ -264,6 +269,8 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskingIsFree = true; t->allOffMaskIsSafe = true; t->maskBitCount = 1; + t->hasHalf = true; + t->hasTranscendentals = true; } else if (!strcasecmp(isa, "generic-1")) { t->isa = Target::GENERIC; @@ -300,6 +307,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskingIsFree = false; t->allOffMaskIsSafe = false; t->maskBitCount = 32; + t->hasHalf = true; } else if (!strcasecmp(isa, "avx2-x2")) { t->isa = Target::AVX2; @@ -309,6 +317,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskingIsFree = false; t->allOffMaskIsSafe = false; t->maskBitCount = 32; + t->hasHalf = true; } #endif // !LLVM_3_0 else { diff --git a/ispc.h b/ispc.h index bb551a6d..e2d9294d 100644 --- a/ispc.h +++ b/ispc.h @@ -249,6 +249,14 @@ struct Target { is 32 on SSE/AVX, since that matches the HW better, but it's 1 for the generic target. 
*/ int maskBitCount; + + /** Indicates whether the target has native support for float/half + conversions. Exposed to the stdlib as __have_native_half and to the preprocessor as ISPC_TARGET_HAS_HALF. */ + bool hasHalf; + + /** Indicates whether the target has support for transcendentals (beyond + sqrt, which we assume that all of them handle). Exposed to the stdlib as __have_native_transcendentals and to the preprocessor as ISPC_TARGET_HAS_TRANSCENDENTALS. */ + bool hasTranscendentals; }; diff --git a/module.cpp b/module.cpp index e80ac9f7..8bbb4acc 100644 --- a/module.cpp +++ b/module.cpp @@ -1333,6 +1333,11 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre else opts.addMacroDef("ISPC_POINTER_SIZE=64"); + if (g->target.hasHalf) + opts.addMacroDef("ISPC_TARGET_HAS_HALF"); + if (g->target.hasTranscendentals) + opts.addMacroDef("ISPC_TARGET_HAS_TRANSCENDENTALS"); + opts.addMacroDef("ISPC_MAJOR_VERSION=1"); opts.addMacroDef("ISPC_MINOR_VERSION=2"); diff --git a/stdlib.ispc b/stdlib.ispc index 25871616..9b2fe17d 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -2915,7 +2915,10 @@ static inline uniform float atan2(uniform float y, uniform float x) { __declspec(safe) static inline float exp(float x_full) { - if (__math_lib == __math_lib_svml) { + if (__have_native_transcendentals) { + return __exp_varying_float(x_full); + } + else if (__math_lib == __math_lib_svml) { return __svml_exp(x_full); } else if (__math_lib == __math_lib_system) { @@ -2994,7 +2997,10 @@ static inline float exp(float x_full) { __declspec(safe) static inline uniform float exp(uniform float x_full) { - if (__math_lib == __math_lib_system || + if (__have_native_transcendentals) { + return __exp_uniform_float(x_full); + } + else if (__math_lib == __math_lib_system || __math_lib == __math_lib_svml) { return __stdlib_expf(x_full); } @@ -3116,7 +3122,10 @@ static inline void __range_reduce_log(uniform float input, uniform float * unifo __declspec(safe) static inline float log(float x_full) { - if (__math_lib == __math_lib_svml) { + if (__have_native_transcendentals) { + return __log_varying_float(x_full); + } + else if (__math_lib == __math_lib_svml) { return 
__svml_log(x_full); } else if (__math_lib == __math_lib_system) { @@ -3204,7 +3213,10 @@ static inline float log(float x_full) { __declspec(safe) static inline uniform float log(uniform float x_full) { - if (__math_lib == __math_lib_system || + if (__have_native_transcendentals) { + return __log_uniform_float(x_full); + } + else if (__math_lib == __math_lib_system || __math_lib == __math_lib_svml) { return __stdlib_logf(x_full); } @@ -3285,7 +3297,10 @@ static inline uniform float log(uniform float x_full) { __declspec(safe) static inline float pow(float a, float b) { - if (__math_lib == __math_lib_svml) { + if (__have_native_transcendentals) { + return __pow_varying_float(a, b); + } + else if (__math_lib == __math_lib_svml) { return __svml_pow(a, b); } else if (__math_lib == __math_lib_system) { @@ -3304,6 +3319,9 @@ static inline float pow(float a, float b) { __declspec(safe) static inline uniform float pow(uniform float a, uniform float b) { + if (__have_native_transcendentals) { + return __pow_uniform_float(a, b); + } if (__math_lib == __math_lib_system || __math_lib == __math_lib_svml) { return __stdlib_powf(a, b);