From 467f1e71d7c01d6f27703978e5567e3dd9143e76 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Wed, 3 Aug 2011 15:58:42 +0100
Subject: [PATCH] Add fast versions of the float<-->half conversion routines in
 the stdlib.

These give slightly wrong results for zero and denormals and do not
handle Inf/NaN correctly, but they are much more efficient than the
full versions of these routines.
---
 stdlib.ispc       | 74 +++++++++++++++++++++++++++++++++++++++++++++++
 tests/half-2.ispc | 32 ++++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 tests/half-2.ispc

diff --git a/stdlib.ispc b/stdlib.ispc
index 9e63c566..bbd9515b 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -2600,6 +2600,80 @@ static inline int16 float_to_half(float f) {
 }
 
 
+static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
+    // Fast path: zero, denormals, Inf and NaN are not special-cased.
+    // Isolate the three fields of the 16-bit pattern.
+    uniform unsigned int32 sign16 = h & (int32)0x8000u;
+    uniform unsigned int32 exp16 = h & (int32)0x7C00u;
+    uniform unsigned int32 frac16 = h & (int32)0x03FFu;
+
+    // Re-bias the exponent from half (15) to single (127).
+    uniform int32 rebiased = ((int32)(exp16 >> 10)) - 15 + 127;
+    // Move each field to its slot in the 32-bit layout: sign 15 -> 31,
+    // exponent to bit 23, fraction widened from 10 to 23 bits.
+    uniform unsigned int32 sign32 = sign16 << 16;
+    uniform unsigned int32 exp32 = (unsigned int32)(rebiased << 23);
+    uniform unsigned int32 frac32 = frac16 << 13;
+    return floatbits(sign32 | exp32 | frac32);
+}
+
+static inline float half_to_float_fast(unsigned int16 h) {
+    // Fast path: zero, denormals, Inf and NaN are not special-cased.
+    // Isolate the three fields of the 16-bit pattern.
+    unsigned int32 sign16 = h & (int32)0x8000u;
+    unsigned int32 exp16 = h & (int32)0x7C00u;
+    unsigned int32 frac16 = h & (int32)0x03FFu;
+
+    // Re-bias the exponent from half (15) to single (127).
+    int32 rebiased = ((int32)(exp16 >> 10)) - 15 + 127;
+    // Move each field to its slot in the 32-bit layout: sign 15 -> 31,
+    // exponent to bit 23, fraction widened from 10 to 23 bits.
+    unsigned int32 sign32 = sign16 << 16;
+    unsigned int32 exp32 = (unsigned int32)(rebiased << 23);
+    unsigned int32 frac32 = frac16 << 13;
+    return floatbits(sign32 | exp32 | frac32);
+}
+
+static inline uniform int16 float_to_half_fast(uniform float f) {
+    // Fast path: zero, denormals, Inf and NaN are not special-cased.
+    uniform int32 bits = intbits(f);
+    uniform unsigned int32 sign32 = bits & 0x80000000u;
+    uniform unsigned int32 exp32 = bits & 0x7F800000u;
+    uniform unsigned int32 frac32 = bits & 0x007FFFFFu;
+
+    // Re-bias the exponent from single (127) to half (15).
+    uniform int32 rebiased = ((int)(exp32 >> 23)) - 127 + 15;
+    // Move each field to its slot in the 16-bit layout: sign 31 -> 15,
+    // exponent to bit 10, fraction truncated from 23 to 10 bits.
+    uniform unsigned int32 sign16 = (sign32 >> 16);
+    uniform unsigned int32 exp16 = (rebiased << 10);
+    uniform int32 frac16 = (frac32 >> 13);
+    uniform int32 packed = (sign16 | exp16 | frac16);
+    // Round on the top discarded bit; the carry may overflow to inf (OK).
+    packed += (frac32 >> 12) & 1u;
+    return (int16)packed;
+}
+
+static inline int16 float_to_half_fast(float f) {
+    // Fast path: zero, denormals, Inf and NaN are not special-cased.
+    int32 bits = intbits(f);
+    unsigned int32 sign32 = bits & 0x80000000u;
+    unsigned int32 exp32 = bits & 0x7F800000u;
+    unsigned int32 frac32 = bits & 0x007FFFFFu;
+
+    // Re-bias the exponent from single (127) to half (15).
+    int32 rebiased = ((int)(exp32 >> 23)) - 127 + 15;
+    // Move each field to its slot in the 16-bit layout: sign 31 -> 15,
+    // exponent to bit 10, fraction truncated from 23 to 10 bits.
+    unsigned int32 sign16 = (sign32 >> 16);
+    unsigned int32 exp16 = (rebiased << 10);
+    int32 frac16 = (frac32 >> 13);
+    int32 packed = (sign16 | exp16 | frac16);
+    // Round on the top discarded bit; the carry may overflow to inf (OK).
+    packed += (frac32 >> 12) & 1u;
+    return (int16)packed;
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // RNG stuff
 
diff --git a/tests/half-2.ispc b/tests/half-2.ispc
new file mode 100644
index 00000000..991c0947
--- /dev/null
+++ b/tests/half-2.ispc
@@ -0,0 +1,32 @@
+
+export uniform int width() { return programCount; }  // Returns the built-in programCount.
+
+export void f_v(uniform float RET[]) {
+    // Count the half bit patterns that fail the fast round trip.
+    int errors = 0;
+
+    // We should match up except for the denorms (0->1023) and the inf/nan
+    // ranges; first the positive patterns, then those with the sign bit set.
+    for (uniform int i = 1024; i < 32768-1024; ++i) {
+        unsigned int16 h = i;  // NOTE(review): never read before reassignment
+        float f = half_to_float_fast(i);
+        h = float_to_half_fast(f);
+
+        if (i != h)
+            ++errors;
+    }
+
+    for (uniform int i = 32768+1024; i < 65536-1024; ++i) {
+        unsigned int16 h = i;  // NOTE(review): never read before reassignment
+        float f = half_to_float_fast(i);
+        h = float_to_half_fast(f);
+
+        if (i != h)
+            ++errors;
+    }
+    RET[programIndex] = errors;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;  // NOTE(review): presumably the expected f_v output (0 errors) -- confirm
+}