From 0c1b206185e13e39be330622e352edd7208fcbad Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Thu, 3 May 2012 13:46:56 -0700
Subject: [PATCH] Pass log/exp/pow transcendentals through to targets that
 support them.

Currently, these are the generic targets.
---
 builtins.cpp                     |   8 ++-
 builtins/util.m4                 |   7 ++
 examples/intrinsics/generic-16.h | 115 +++++++++++++++++++++++++++++++
 ispc.cpp                         |   9 +++
 ispc.h                           |   8 +++
 module.cpp                       |   5 ++
 stdlib.ispc                      |  28 ++++++--
 7 files changed, 172 insertions(+), 8 deletions(-)
diff --git a/builtins.cpp b/builtins.cpp
index b94fa04f..d9432ae9 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -886,10 +886,12 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                        symbolTable);
     lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
                        symbolTable);
-    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
-                           symbolTable);
+    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload,
+                           module, symbolTable);
 
-    lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
+    lDefineConstantInt("__have_native_half", g->target.hasHalf, module, 
+                       symbolTable);
+    lDefineConstantInt("__have_native_transcendentals", g->target.hasTranscendentals,
                        module, symbolTable);
 
     if (includeStdlibISPC) {
diff --git a/builtins/util.m4 b/builtins/util.m4
index 501f2e47..042b2ef5 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1654,6 +1654,13 @@ declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>,
 declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
                                                  <WIDTH x i64>, <WIDTH x MASK>) nounwind
 
+declare float @__log_uniform_float(float) nounwind readnone
+declare <WIDTH x float> @__log_varying_float(<WIDTH x float>) nounwind readnone
+declare float @__exp_uniform_float(float) nounwind readnone
+declare <WIDTH x float> @__exp_varying_float(<WIDTH x float>) nounwind readnone
+declare float @__pow_uniform_float(float, float) nounwind readnone
+declare <WIDTH x float> @__pow_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector ops
 
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index d6a5c121..384a9ece 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -586,6 +586,121 @@ ROTATE(__vec16_f, float, float)
 SHUFFLES(__vec16_f, float, float)
 LOAD_STORE(__vec16_f, float)
 
+static FORCEINLINE float __exp_uniform_float(float v) {  // scalar exp: forwards to expf()
+    return expf(v);
+}
+
+static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) {
+    __vec16_f result;  // lane-wise exp: one expf() call per lane
+    for (int lane = 0; lane != 16; ++lane)
+        result.v[lane] = expf(v.v[lane]);
+    return result;
+}
+
+static FORCEINLINE float __log_uniform_float(float v) {  // scalar log: forwards to logf()
+    return logf(v);
+}
+
+static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) {
+    __vec16_f result;  // lane-wise log: one logf() call per lane
+    for (int lane = 0; lane != 16; ++lane)
+        result.v[lane] = logf(v.v[lane]);
+    return result;
+}
+
+static FORCEINLINE float __pow_uniform_float(float a, float b) {  // scalar pow: forwards to powf(a, b)
+    return powf(a, b);
+}
+
+static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) {
+    __vec16_f result;  // lane-wise pow: powf() on matching lanes of a and b
+    for (int lane = 0; lane != 16; ++lane)
+        result.v[lane] = powf(a.v[lane], b.v[lane]);
+    return result;
+}
+
+static FORCEINLINE int __intbits(float v) {
+    // Returns the bit pattern of v as an int.  The value is written to the
+    // float member of a union and read back through the int member, so the
+    // bits are reinterpreted rather than numerically converted.
+    union { float asFloat; int asInt; } pun;
+    pun.asFloat = v;
+    return pun.asInt;
+}
+
+static FORCEINLINE float __floatbits(int v) {
+    // Returns the float whose bit pattern is v.  The value is written to the
+    // int member of a union and read back through the float member, so the
+    // bits are reinterpreted rather than numerically converted.
+    union { float asFloat; int asInt; } pun;
+    pun.asInt = v;
+    return pun.asFloat;
+}
+
+static FORCEINLINE float __half_to_float_uniform(int16_t h) {
+    // Expands the 16-bit half-precision bit pattern in h to a float using
+    // integer arithmetic on the float's bit representation.
+    static const uint32_t kShiftedExpMask = 0x7c00 << 13; // half exponent bits after the shift
+    int32_t bits = ((int32_t)(h & 0x7fff)) << 13;  // exponent/mantissa, sign stripped
+    const uint32_t expBits = kShiftedExpMask & bits;  // exponent only, taken before the adjust
+    bits += (127 - 15) << 23;  // exponent adjust
+
+    if (expBits == 0) {  // zero or denormal
+        bits += 1 << 23;  // extra exponent adjust
+        bits = __intbits(__floatbits(bits) - __floatbits(113 << 23));  // renormalize
+    }
+    else if (expBits == kShiftedExpMask)  // Inf or NaN
+        bits += (128 - 16) << 23;  // extra exponent adjust
+
+    bits |= ((int32_t)(h & 0x8000)) << 16;  // carry the sign bit over to bit 31
+    return __floatbits(bits);
+}
+
+
+static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) {
+    __vec16_f result;  // each lane is converted by the scalar routine
+    for (int lane = 0; lane != 16; ++lane)
+        result.v[lane] = __half_to_float_uniform(v.v[lane]);
+    return result;
+}
+
+
+static FORCEINLINE int16_t __float_to_half_uniform(float f) { // float -> 16-bit half bit pattern
+    uint32_t sign_mask = 0x80000000u;
+    int32_t o;
+
+    int32_t fint = __intbits(f); // raw bits of f
+    int32_t sign = fint & sign_mask; // isolate the sign bit
+    fint ^= sign; // clear the sign bit in fint
+
+    int32_t f32infty = 255 << 23;
+    o = (fint > f32infty) ? 0x7e00 : 0x7c00; // provisional: 0x7e00 above the Inf pattern (NaN), else 0x7c00
+
+    // (De)normalized number or zero: fint2 is computed unconditionally to
+    // save the blending, but is only used below when fint < f32infty.
+    // round_mask is unsigned, so subtracting it wraps and adds 0x1000.
+    const uint32_t round_mask = ~0xfffu; // keeps everything but the low 12 bits
+    const int32_t magic = 15 << 23;
+    const int32_t f16infty = 31 << 23;
+
+    int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask;
+    fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
+
+    if (fint < f32infty)
+        o = fint2 >> 13; // Take the bits!
+
+    return (o | (sign >> 16)); // sign lands in bit 15; higher bits are dropped by the int16_t return
+}
+
+
+static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) {
+    __vec16_i16 result;  // each lane is converted by the scalar routine
+    for (int lane = 0; lane != 16; ++lane)
+        result.v[lane] = __float_to_half_uniform(v.v[lane]);
+    return result;
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // double
 
diff --git a/ispc.cpp b/ispc.cpp
index 3a2134d1..9d1220d5 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -197,6 +197,9 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->arch = arch;
     }
 
+    // Default: no native half or transcendental support (true of most targets)
+    t->hasHalf = t->hasTranscendentals = false;
+
     if (!strcasecmp(isa, "sse2")) {
         t->isa = Target::SSE2;
         t->nativeVectorWidth = 4;
@@ -256,6 +259,8 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->maskingIsFree = true;
         t->allOffMaskIsSafe = true;
         t->maskBitCount = 1;
+        t->hasHalf = true;
+        t->hasTranscendentals = true;
     }
     else if (!strcasecmp(isa, "generic-32")) {
         t->isa = Target::GENERIC;
@@ -264,6 +269,8 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->maskingIsFree = true;
         t->allOffMaskIsSafe = true;
         t->maskBitCount = 1;
+        t->hasHalf = true;
+        t->hasTranscendentals = true;
     }
     else if (!strcasecmp(isa, "generic-1")) {
         t->isa = Target::GENERIC;
@@ -300,6 +307,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->maskingIsFree = false;
         t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
+        t->hasHalf = true;
     }
     else if (!strcasecmp(isa, "avx2-x2")) {
         t->isa = Target::AVX2;
@@ -309,6 +317,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->maskingIsFree = false;
         t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
+        t->hasHalf = true;
     }
 #endif // !LLVM_3_0
     else {
diff --git a/ispc.h b/ispc.h
index bb551a6d..e2d9294d 100644
--- a/ispc.h
+++ b/ispc.h
@@ -249,6 +249,14 @@ struct Target {
         is 32 on SSE/AVX, since that matches the HW better, but it's 1 for
         the generic target. */
     int maskBitCount;
+
+    /** Indicates whether the target has native support for float/half
+        conversions. */
+    bool hasHalf;
+
+    /** Indicates whether the target has support for transcendentals (beyond
+        sqrt, which we assume that all of them handle). */
+    bool hasTranscendentals;
 };
 
 
diff --git a/module.cpp b/module.cpp
index e80ac9f7..8bbb4acc 100644
--- a/module.cpp
+++ b/module.cpp
@@ -1333,6 +1333,11 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
     else
         opts.addMacroDef("ISPC_POINTER_SIZE=64");
 
+    if (g->target.hasHalf)
+        opts.addMacroDef("ISPC_TARGET_HAS_HALF");
+    if (g->target.hasTranscendentals)
+        opts.addMacroDef("ISPC_TARGET_HAS_TRANSCENDENTALS");
+
     opts.addMacroDef("ISPC_MAJOR_VERSION=1");
     opts.addMacroDef("ISPC_MINOR_VERSION=2");
 
diff --git a/stdlib.ispc b/stdlib.ispc
index 25871616..9b2fe17d 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -2915,7 +2915,10 @@ static inline uniform float atan2(uniform float y, uniform float x) {
 
 __declspec(safe)
 static inline float exp(float x_full) {
-    if (__math_lib == __math_lib_svml) {
+    if (__have_native_transcendentals) {
+        return __exp_varying_float(x_full);
+    }
+    else if (__math_lib == __math_lib_svml) {
         return __svml_exp(x_full);
     }
     else if (__math_lib == __math_lib_system) {
@@ -2994,7 +2997,10 @@ static inline float exp(float x_full) {
 
 __declspec(safe)
 static inline uniform float exp(uniform float x_full) {
-    if (__math_lib == __math_lib_system ||
+    if (__have_native_transcendentals) {
+        return __exp_uniform_float(x_full);
+    }
+    else if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
         return __stdlib_expf(x_full);
     }
@@ -3116,7 +3122,10 @@ static inline void __range_reduce_log(uniform float input, uniform float * unifo
 
 __declspec(safe)
 static inline float log(float x_full) {
-    if (__math_lib == __math_lib_svml) {
+    if (__have_native_transcendentals) {
+        return __log_varying_float(x_full);
+    }
+    else if (__math_lib == __math_lib_svml) {
         return __svml_log(x_full);
     }
     else if (__math_lib == __math_lib_system) {
@@ -3204,7 +3213,10 @@ static inline float log(float x_full) {
 
 __declspec(safe)
 static inline uniform float log(uniform float x_full) {
-    if (__math_lib == __math_lib_system ||
+    if (__have_native_transcendentals) {
+        return __log_uniform_float(x_full);
+    }
+    else if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
         return __stdlib_logf(x_full);
     }
@@ -3285,7 +3297,10 @@ static inline uniform float log(uniform float x_full) {
 
 __declspec(safe)
 static inline float pow(float a, float b) {
-    if (__math_lib == __math_lib_svml) {
+    if (__have_native_transcendentals) {
+        return __pow_varying_float(a, b);
+    }
+    else if (__math_lib == __math_lib_svml) {
         return __svml_pow(a, b);
     }
     else if (__math_lib == __math_lib_system) {
@@ -3304,6 +3319,9 @@ static inline float pow(float a, float b) {
 
 __declspec(safe)
 static inline uniform float pow(uniform float a, uniform float b) {
+    if (__have_native_transcendentals) {
+        return __pow_uniform_float(a, b);
+    }
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
         return __stdlib_powf(a, b);