From 88cd5584e85cffd606cf7adf7f6ad7dcfa942a35 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Thu, 22 Mar 2012 13:27:26 -0700
Subject: [PATCH 1/2] Add Debug() statement to report on if stmt cost/safety test results.

---
 stmt.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/stmt.cpp b/stmt.cpp
index d9018f02..9aad4291 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -541,6 +541,10 @@ IfStmt::emitVaryingIf(FunctionEmitContext *ctx, llvm::Value *ltest) const {
     bool safeToRunWithAllLanesOff = (SafeToRunWithMaskAllOff(trueStmts) &&
                                      SafeToRunWithMaskAllOff(falseStmts));
 
+    Debug(pos, "If statement: true cost %d (safe %d), false cost %d (safe %d).",
+          ::EstimateCost(trueStmts), (int)SafeToRunWithMaskAllOff(trueStmts),
+          ::EstimateCost(falseStmts), (int)SafeToRunWithMaskAllOff(falseStmts));
+
     if (safeToRunWithAllLanesOff &&
         (costIsAcceptable || g->opt.disableCoherentControlFlow)) {
         ctx->StartVaryingIf(oldMask);

From 3bb2dee275c232c54b27bdb102d6cd2dff830a26 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Thu, 22 Mar 2012 13:27:56 -0700
Subject: [PATCH 2/2] Update float_to_half() with more efficient version from @rygorous

---
 stdlib.ispc | 250 +++++++++++++++++++++------------------------
 1 file changed, 102 insertions(+), 148 deletions(-)

diff --git a/stdlib.ispc b/stdlib.ispc
index 20f7eac5..f5984277 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -3460,68 +3460,42 @@ static inline uniform int16 float_to_half(uniform float f) {
         return __float_to_half_uniform(f);
     }
     else {
-        uniform int32 x = intbits(f);
-        // Store the return value in an int32 until the very end; this ends up
-        // generating better code...
-        uniform int32 ret;
-        if ((x & 0x7FFFFFFFu) == 0)
-            // Signed zero
-            ret = (x >> 16);
-        else {
-            uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-            uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-            uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
-            if (xe == 0) {
-                // Denormal will underflow, return a signed zero
-                ret = (xs >> 16);
-            }
-            else {
-                if (xe == 0x7F800000u) {
-                    // Inf or NaN (all the exponent bits are set)
-                    if (xm == 0)
-                        // Zero mantissa -> signed infinity
-                        ret = ((xs >> 16) | 0x7C00u);
-                    else
-                        // NaN, only 1st mantissa bit set
-                        ret = 0xFE00u;
-                }
-                else {
-                    // Normalized number
-                    uniform unsigned int32 hs = (xs >> 16); // Sign bit
-                    uniform unsigned int32 hm;
-                    // Exponent unbias the single, then bias the halfp
-                    uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
-                    if (hes >= 0x1F)
-                        // Overflow: return signed infinity
-                        ret = ((xs >> 16) | 0x7C00u);
-                    else if (hes <= 0) {
-                        // Underflow
-                        if ((14 - hes) > 24) {
-                            // Mantissa shifted all the way off & no rounding possibility
-                            hm = 0u; // Set mantissa to zero
-                        }
-                        else {
-                            xm |= 0x00800000u; // Add the hidden leading bit
-                            hm = (xm >> (14 - hes)); // Mantissa
-                            if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
-                                // Round, might overflow into exp bit, but this is OK
-                                hm += 1u;
-                        }
-                        ret = (hs | hm);
-                    }
-                    else {
-                        uniform unsigned int32 he = (hes << 10); // Exponent
-                        hm = (xm >> 13); // Mantissa
-                        if (xm & 0x00001000u) // Check for rounding
-                            // Round, might overflow to inf, this is OK
-                            ret = (hs | he | hm) + 1u;
-                        else
-                            ret = (hs | he | hm);
-                    }
-                }
-            }
-        }
-        return (int16)ret;
+        // via Fabian "ryg" Giesen.
+        // https://gist.github.com/2156668
+        uniform unsigned int32 sign_mask = 0x80000000u;
+        uniform int32 o;
+
+        uniform int32 fint = intbits(f);
+        uniform int32 sign = fint & sign_mask;
+        fint ^= sign;
+
+        // NOTE all the integer compares in this function can be safely
+        // compiled into signed compares since all operands are below
+        // 0x80000000. Important if you want fast straight SSE2 code (since
+        // there's no unsigned PCMPGTD).
+
+        // Inf or NaN (all exponent bits set)
+        // NaN->qNaN and Inf->Inf
+        // unconditional assignment here, will override with right value for
+        // the regular case below.
+        uniform int32 f32infty = 255 << 23;
+        o = (fint > f32infty) ? 0x7e00 : 0x7c00;
+
+        // (De)normalized number or zero
+        // update fint unconditionally to save the blending; we don't need it
+        // anymore for the Inf/NaN case anyway.
+
+        const uniform unsigned int32 round_mask = ~0xfffu;
+        const uniform int32 magic = 15 << 23;
+        const uniform int32 f16infty = 31 << 23;
+
+        uniform int32 fint2 = intbits(floatbits(fint & round_mask) * floatbits(magic)) - round_mask;
+        fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
+
+        if (fint < f32infty)
+            o = fint2 >> 13; // Take the bits!
+
+        return (o | (sign >> 16));
     }
 }
 
@@ -3532,68 +3506,58 @@ static inline int16 float_to_half(float f) {
         return __float_to_half_varying(f);
     }
     else {
-        int32 x = intbits(f);
-        // Store the return value in an int32 until the very end; this ends up
-        // generating better code...
-        int32 ret;
-        if ((x & 0x7FFFFFFFu) == 0)
-            // Signed zero
-            ret = (x >> 16);
-        else {
-            unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-            unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-            unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
-            if (xe == 0) {
-                // Denormal will underflow, return a signed zero
-                ret = (xs >> 16);
-            }
-            else {
-                cif (xe == 0x7F800000u) {
-                    // Inf or NaN (all the exponent bits are set)
-                    if (xm == 0)
-                        // Zero mantissa -> signed infinity
-                        ret = ((xs >> 16) | 0x7C00u);
-                    else
-                        // NaN, only 1st mantissa bit set
-                        ret = 0xFE00u;
-                }
-                else {
-                    // Normalized number
-                    unsigned int32 hs = (xs >> 16); // Sign bit
-                    unsigned int32 hm;
-                    // Exponent unbias the single, then bias the halfp
-                    int32 hes = ((int)(xe >> 23)) - 127 + 15;
-                    if (hes >= 0x1F)
-                        // Overflow: return signed infinity
-                        ret = ((xs >> 16) | 0x7C00u);
-                    else if (hes <= 0) {
-                        // Underflow
-                        if ((14 - hes) > 24) {
-                            // Mantissa shifted all the way off & no rounding possibility
-                            hm = 0u; // Set mantissa to zero
-                        }
-                        else {
-                            xm |= 0x00800000u; // Add the hidden leading bit
-                            hm = (xm >> (14 - hes)); // Mantissa
-                            if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
-                                // Round, might overflow into exp bit, but this is OK
-                                hm += 1u;
-                        }
-                        ret = (hs | hm);
-                    }
-                    else {
-                        unsigned int32 he = (hes << 10); // Exponent
-                        hm = (xm >> 13); // Mantissa
-                        if (xm & 0x00001000u) // Check for rounding
-                            // Round, might overflow to inf, this is OK
-                            ret = (hs | he | hm) + 1u;
-                        else
-                            ret = (hs | he | hm);
-                    }
-                }
-            }
-        }
-        return (int16)ret;
+        // via Fabian "ryg" Giesen.
+        // https://gist.github.com/2156668
+        unsigned int32 sign_mask = 0x80000000u;
+        int32 o;
+
+        int32 fint = intbits(f);
+        int32 sign = fint & sign_mask;
+        fint ^= sign;
+
+        // NOTE all the integer compares in this function can be safely
+        // compiled into signed compares since all operands are below
+        // 0x80000000. Important if you want fast straight SSE2 code (since
+        // there's no unsigned PCMPGTD).
+
+        // Inf or NaN (all exponent bits set)
+        // NaN->qNaN and Inf->Inf
+        // unconditional assignment here, will override with right value for
+        // the regular case below.
+        int32 f32infty = 255 << 23;
+        o = (fint > f32infty) ? 0x7e00 : 0x7c00;
+
+        // (De)normalized number or zero
+        // update fint unconditionally to save the blending; we don't need it
+        // anymore for the Inf/NaN case anyway.
+
+        const unsigned int32 round_mask = ~0xfffu;
+        const int32 magic = 15 << 23;
+        const int32 f16infty = 31 << 23;
+
+        // Shift exponent down, denormalize if necessary.
+        // NOTE This represents half-float denormals using single precision denormals.
+        // The main reason to do this is that there's no shift with per-lane variable
+        // shifts in SSE*, which we'd otherwise need. It has some funky side effects
+        // though:
+        // - This conversion will actually respect the FTZ (Flush To Zero) flag in
+        //   MXCSR - if it's set, no half-float denormals will be generated. I'm
+        //   honestly not sure whether this is good or bad. It's definitely interesting.
+        // - If the underlying HW doesn't support denormals (not an issue with Intel
+        //   CPUs, but might be a problem on GPUs or PS3 SPUs), you will always get
+        //   flush-to-zero behavior. This is bad, unless you're on a CPU where you don't
+        //   care.
+        // - Denormals tend to be slow. FP32 denormals are rare in practice outside of things
+        //   like recursive filters in DSP - not a typical half-float application. Whether
+        //   FP16 denormals are rare in practice, I don't know. Whatever slow path your HW
+        //   may or may not have for denormals, this may well hit it.
+        int32 fint2 = intbits(floatbits(fint & round_mask) * floatbits(magic)) - round_mask;
+        fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
+
+        if (fint < f32infty)
+            o = fint2 >> 13; // Take the bits!
+
+        return (o | (sign >> 16));
     }
 }
 
@@ -3604,19 +3568,15 @@ static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
         return __half_to_float_uniform(h);
     }
     else {
-        uniform unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
-        uniform unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
-        uniform unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
+        uniform unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
+        uniform unsigned int32 hem = h & (int32)0x7fffu;  // Pick off exponent-mantissa bits
 
-        // sign
-        uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
-        // Exponent: unbias the halfp, then bias the single
-        uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
-        // Exponent
-        uniform unsigned int32 xe = (unsigned int32) (xes << 23);
-        // Mantissa
-        uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
-        return floatbits(xs | xe | xm);
+        uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
+        uniform unsigned int32 xem = ((unsigned int32) hem) << 13;
+
+        xem += 0x38000000;  // (127 - 15) << 23
+
+        return floatbits(xs | xem);
     }
 }
 
@@ -3626,19 +3586,13 @@ static inline float half_to_float_fast(unsigned int16 h) {
         return __half_to_float_varying(h);
     }
    else {
-        unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
-        unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
-        unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
+        unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
+        unsigned int32 hem = h & (int32)0x7fffu;  // Pick off exponent-mantissa bits
 
-        // sign
-        unsigned int32 xs = ((unsigned int32) hs) << 16;
-        // Exponent: unbias the halfp, then bias the single
-        int32 xes = ((int32) (he >> 10)) - 15 + 127;
-        // Exponent
-        unsigned int32 xe = (unsigned int32) (xes << 23);
-        // Mantissa
-        unsigned int32 xm = ((unsigned int32) hm) << 13;
-        return floatbits(xs | xe | xm);
+        unsigned int32 xs = ((unsigned int32) hs) << 16;
+        unsigned int32 xem = ((unsigned int32) hem) << 13;
+
+        return floatbits(xs | (xem + 0x38000000 /* (127 - 15) << 23 */));
     }
 }
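
Note (not part of either patch): for readers who want to poke at these conversions outside of ispc, the sketch below re-expresses the same bit manipulations as plain scalar C++. The file name, the *_ref helper names, and the test values are made up for illustration; intbits()/floatbits() are approximated with memcpy. It mirrors the ryg routines the patch adopts, but it is a sanity-check sketch under those assumptions, not the stdlib implementation itself.

// float_to_half_ref.cpp -- hypothetical standalone file, not part of the patch series.
// Scalar C++ sketch of the bit tricks used by the new ispc code.
//   g++ -std=c++11 float_to_half_ref.cpp && ./a.out
#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-ins for ispc's intbits()/floatbits(): reinterpret bits, no numeric conversion.
static uint32_t intbits(float f)   { uint32_t u; std::memcpy(&u, &f, sizeof u); return u; }
static float    floatbits(uint32_t u) { float f; std::memcpy(&f, &u, sizeof f); return f; }

// Same algorithm as the new float_to_half(): one multiply by the "magic"
// constant (bit pattern 15 << 23, i.e. 2^-112) rebiases the exponent and lets
// the FPU denormalize tiny values; subtracting round_mask adds 0x1000, a
// rounding increment at bit 12, before the bits are shifted into half layout.
static uint16_t float_to_half_ref(float f) {
    const uint32_t sign_mask  = 0x80000000u;
    const uint32_t round_mask = ~0xfffu;        // keep bit 12, clear the 12 bits below it
    const uint32_t f32infty   = 255u << 23;     // FP32 exponent field all ones
    const uint32_t f16infty   = 31u << 23;      // half infinity, viewed through FP32 bits

    uint32_t fint = intbits(f);
    uint32_t sign = fint & sign_mask;
    fint ^= sign;                               // work on the magnitude

    uint32_t o;
    if (fint >= f32infty) {
        o = (fint > f32infty) ? 0x7e00 : 0x7c00;   // NaN -> qNaN, Inf -> Inf
    } else {
        uint32_t fint2 = intbits(floatbits(fint & round_mask) * floatbits(15u << 23)) - round_mask;
        if (fint2 > f16infty) fint2 = f16infty;    // clamp overflow to infinity
        o = fint2 >> 13;                           // take the bits
    }
    return (uint16_t)(o | (sign >> 16));
}

// Same algorithm as the new half_to_float_fast(): move the sign into place,
// shift exponent+mantissa up by 13, and add (127 - 15) << 23 to rebias.  Like
// the ispc routine, it is only meant for normalized halfs -- zero, denormals,
// Inf and NaN are not special-cased (hence "fast").
static float half_to_float_fast_ref(uint16_t h) {
    uint32_t hs  = h & 0x8000u;                 // sign bit
    uint32_t hem = h & 0x7fffu;                 // exponent + mantissa bits
    uint32_t xs  = hs << 16;
    uint32_t xem = hem << 13;
    return floatbits(xs | (xem + 0x38000000u)); // 0x38000000 == (127 - 15) << 23
}

int main() {
    // Values chosen to stay in the normalized half range (see the caveat above).
    const float tests[] = { 1.0f, -2.5f, 0.333333f, 3.1415926f, 65504.0f };
    for (float f : tests) {
        uint16_t h = float_to_half_ref(f);
        std::printf("%12g -> 0x%04x -> %12g\n", f, h, (double)half_to_float_fast_ref(h));
    }
    return 0;
}

Running it should show each value surviving the round trip at half precision (for example 1.0 -> 0x3c00 -> 1.0, and 65504 -> 0x7bff -> 65504), with the expected ~3 significant digits for values like 0.333333.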