From 1867b5b317a099d02bdf39cc3ff669affbe83e87 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Tue, 24 Jan 2012 15:33:38 -0800
Subject: [PATCH] Use native float/half conversion instructions with the AVX2
 target.

---
 builtins.cpp               |    3 +
 builtins/target-avx1-x2.ll |    8 +
 builtins/target-avx1.ll    |   11 +-
 builtins/target-avx2-x2.ll |   55 ++++
 builtins/target-avx2.ll    |   41 ++-
 builtins/target-sse2-x2.ll |    8 +
 builtins/target-sse2.ll    |    8 +
 builtins/target-sse4-x2.ll |    8 +
 builtins/target-sse4.ll    |    8 +
 ispc.cpp                   |    4 +-
 stdlib.ispc                |  534 ++++++++++++++++++++-----------------
 tests/half-3.ispc          |   21 ++
 12 files changed, 453 insertions(+), 256 deletions(-)
 create mode 100644 tests/half-3.ispc

diff --git a/builtins.cpp b/builtins.cpp
index 2608e031..76ebdfa7 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -822,6 +822,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload,
                            module, symbolTable);
 
+    lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
+                       module, symbolTable);
+
     if (includeStdlibISPC) {
         // If the user wants the standard library to be included, parse the
         // serialized version of the stdlib.ispc file to get its

diff --git a/builtins/target-avx1-x2.ll b/builtins/target-avx1-x2.ll
index d05da95f..36f47cec 100644
--- a/builtins/target-avx1-x2.ll
+++ b/builtins/target-avx1-x2.ll
@@ -58,6 +58,14 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
   ret <16 x i32> %ret
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather

diff --git a/builtins/target-avx1.ll b/builtins/target-avx1.ll
index 137ddf00..e46fc3b4 100644
--- a/builtins/target-avx1.ll
+++ b/builtins/target-avx1.ll
@@ -58,6 +58,14 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
   ret <8 x i32> %ret
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
 
@@ -65,6 +73,3 @@ gen_gather(8, i8)
 gen_gather(8, i16)
 gen_gather(8, i32)
 gen_gather(8, i64)
-
-
-

diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll
index fa4f345f..e4d3f686 100644
--- a/builtins/target-avx2-x2.ll
+++ b/builtins/target-avx2-x2.ll
@@ -63,6 +63,61 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
   ret <16 x i32> %m
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; 0 is round nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
+  %r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
+             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
+  %r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
+             <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
+  %r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x float> %r
+}
+
+define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
+  %r_0 = shufflevector <16 x float> %v, <16 x float> undef,
+             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
+  %r_1 = shufflevector <16 x float> %v, <16 x float> undef,
+             <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
+  %r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %r
+}
+
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %v1 = bitcast i16 %v to <1 x i16>
+  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
+  %r = extractelement <8 x float> %rv, i32 0
+  ret float %r
+}
+
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %v1 = bitcast float %v to <1 x float>
+  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  ; round to nearest even
+  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
+  %r = extractelement <8 x i16> %rv, i32 0
+  ret i16 %r
+}
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather

diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll
index c812ede1..66b2a23e 100644
--- a/builtins/target-avx2.ll
+++ b/builtins/target-avx2.ll
@@ -63,6 +63,44 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
   ret <8 x i32> %m
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; 0 is round nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
+  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
+  ret <8 x float> %r
+}
+
+define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
+  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
+  ret <8 x i16> %r
+}
+
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %v1 = bitcast i16 %v to <1 x i16>
+  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
+  %r = extractelement <8 x float> %rv, i32 0
+  ret float %r
+}
+
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %v1 = bitcast float %v to <1 x float>
+  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  ; round to nearest even
+  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
+  %r = extractelement <8 x i16> %rv, i32 0
+  ret i16 %r
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
 
@@ -70,6 +108,3 @@ gen_gather(8, i8)
 gen_gather(8, i16)
 gen_gather(8, i32)
 gen_gather(8, i64)
-
-
-

diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll
index c0030f31..2e6d1bdc 100644
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -47,6 +47,14 @@ int64minmax()
 
 include(`target-sse2-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll
index 8d9911d8..21ffb267 100644
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -44,6 +44,14 @@ int64minmax()
 
 include(`target-sse2-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding
 ;;

diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll
index b7cd36ec..5a467ec2 100644
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -47,6 +47,14 @@ int64minmax()
 
 include(`target-sse4-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll
index 68ff49d9..9dfe9db7 100644
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -44,6 +44,14 @@ int64minmax()
 
 include(`target-sse4-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

diff --git a/ispc.cpp b/ispc.cpp
index 523927fc..7fbc5bc6 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -210,7 +210,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->isa = Target::AVX2;
         t->nativeVectorWidth = 8;
         t->vectorWidth = 8;
-        t->attributes = "+avx2,+popcnt,+cmov";
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
         t->maskingIsFree = false;
         t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
@@ -219,7 +219,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->isa = Target::AVX2;
         t->nativeVectorWidth = 16;
         t->vectorWidth = 16;
-        t->attributes = "+avx2,+popcnt,+cmov";
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
         t->maskingIsFree = false;
         t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;

diff --git a/stdlib.ispc b/stdlib.ispc
index 8a7daf49..6cb7e732 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -2824,114 +2824,124 @@ static inline uniform double pow(uniform double a, uniform double b) {
 // half-precision floats
 
 static inline uniform float half_to_float(uniform unsigned int16 h) {
-    if ((h & 0x7FFFu) == 0)
-        // Signed zero
-        return floatbits(((unsigned int32) h) << 16);
+    if (__have_native_half) {
+        return __half_to_float_uniform(h);
+    }
     else {
-        // Though these are int16 quantities, we get much better code
-        // with them stored as int32s...
-        uniform unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
-        uniform unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
-        uniform unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
-        if (he == 0) {
-            // Denormal will convert to normalized
-            uniform int e = -1;
-            // The following loop figures out how much extra to adjust the exponent
-            // Shift until leading bit overflows into exponent bit
-            do {
-                e++;
-                hm <<= 1;
-            } while((hm & 0x0400u) == 0);
-
-            // Sign bit
-            uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
-            // Exponent: unbias the halfp, then bias the single
-            uniform int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
-            // Exponent
-            uniform unsigned int32 xe = (unsigned int32) (xes << 23);
-            // Mantissa
-            uniform unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
-            return floatbits(xs | xe | xm);
-        }
+        if ((h & 0x7FFFu) == 0)
+            // Signed zero
+            return floatbits(((unsigned int32) h) << 16);
         else {
-            if (he == 0x7C00u) {
-                // Inf or NaN (all the exponent bits are set)
-                if (hm == 0)
-                    // Zero mantissa -> signed inf
-                    return floatbits((((unsigned int32) hs) << 16) |
-                                     ((unsigned int32) 0x7F800000u));
-                else
-                    // NaN
-                    return floatbits(0xFFC00000u);
-            }
-            else {
-                // Normalized number
-                // sign
+            // Though these are int16 quantities, we get much better code
+            // with them stored as int32s...
+            uniform unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
+            uniform unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
+            uniform unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
+            if (he == 0) {
+                // Denormal will convert to normalized
+                uniform int e = -1;
+                // The following loop figures out how much extra to adjust the exponent
+                // Shift until leading bit overflows into exponent bit
+                do {
+                    e++;
+                    hm <<= 1;
+                } while((hm & 0x0400u) == 0);
+
+                // Sign bit
                 uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
                 // Exponent: unbias the halfp, then bias the single
-                uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
+                uniform int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
                 // Exponent
-                uniform unsigned int32 xe = (unsigned int32) (xes << 23);
+                uniform unsigned int32 xe = (unsigned int32) (xes << 23);
                 // Mantissa
-                uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
+                uniform unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
                 return floatbits(xs | xe | xm);
+            }
+            else {
+                if (he == 0x7C00u) {
+                    // Inf or NaN (all the exponent bits are set)
+                    if (hm == 0)
+                        // Zero mantissa -> signed inf
+                        return floatbits((((unsigned int32) hs) << 16) |
+                                         ((unsigned int32) 0x7F800000u));
+                    else
+                        // NaN
+                        return floatbits(0xFFC00000u);
+                }
+                else {
+                    // Normalized number
+                    // sign
+                    uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
+                    // Exponent: unbias the halfp, then bias the single
+                    uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
+                    // Exponent
+                    uniform unsigned int32 xe = (unsigned int32) (xes << 23);
+                    // Mantissa
+                    uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
+                    return floatbits(xs | xe | xm);
+                }
             }
         }
     }
 }
 
 static inline float half_to_float(unsigned int16 h) {
-    if ((h & 0x7FFFu) == 0)
-        // Signed zero
-        return floatbits(((unsigned int32) h) << 16);
+    if (__have_native_half) {
+        return __half_to_float_varying(h);
+    }
     else {
-        // Though these are int16 quantities, we get much better code
-        // with them stored as int32s...
-        unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
-        unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
-        unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
-        cif (he == 0) {
-            // Denormal will convert to normalized
-            int e = -1;
-            // The following loop figures out how much extra to adjust the exponent
-            // Shift until leading bit overflows into exponent bit
-            do {
-                e++;
-                hm <<= 1;
-            } while((hm & 0x0400u) == 0);
-
-            // Sign bit
-            unsigned int32 xs = ((unsigned int32) hs) << 16;
-            // Exponent: unbias the halfp, then bias the single
-            int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
-            // Exponent
-            unsigned int32 xe = (unsigned int32) (xes << 23);
-            // Mantissa
-            unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
-            return floatbits(xs | xe | xm);
-        }
+        if ((h & 0x7FFFu) == 0)
+            // Signed zero
+            return floatbits(((unsigned int32) h) << 16);
         else {
-            if (he == 0x7C00u) {
-                // Inf or NaN (all the exponent bits are set)
-                if (hm == 0)
-                    // Zero mantissa -> signed inf
-                    return floatbits((((unsigned int32) hs) << 16) |
-                                     ((unsigned int32) 0x7F800000u));
-                else
-                    // NaN
-                    return floatbits(0xFFC00000u);
-            }
-            else {
-                // Normalized number
-                // sign
+            // Though these are int16 quantities, we get much better code
+            // with them stored as int32s...
+            unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
+            unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
+            unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
+            cif (he == 0) {
+                // Denormal will convert to normalized
+                int e = -1;
+                // The following loop figures out how much extra to adjust the exponent
+                // Shift until leading bit overflows into exponent bit
+                do {
+                    e++;
+                    hm <<= 1;
+                } while((hm & 0x0400u) == 0);
+
+                // Sign bit
                 unsigned int32 xs = ((unsigned int32) hs) << 16;
                 // Exponent: unbias the halfp, then bias the single
-                int32 xes = ((int32) (he >> 10)) - 15 + 127;
+                int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
                 // Exponent
-                unsigned int32 xe = (unsigned int32) (xes << 23);
+                unsigned int32 xe = (unsigned int32) (xes << 23);
                 // Mantissa
-                unsigned int32 xm = ((unsigned int32) hm) << 13;
+                unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
                 return floatbits(xs | xe | xm);
+            }
+            else {
+                if (he == 0x7C00u) {
+                    // Inf or NaN (all the exponent bits are set)
+                    if (hm == 0)
+                        // Zero mantissa -> signed inf
+                        return floatbits((((unsigned int32) hs) << 16) |
+                                         ((unsigned int32) 0x7F800000u));
+                    else
+                        // NaN
+                        return floatbits(0xFFC00000u);
+                }
+                else {
+                    // Normalized number
+                    // sign
+                    unsigned int32 xs = ((unsigned int32) hs) << 16;
+                    // Exponent: unbias the halfp, then bias the single
+                    int32 xes = ((int32) (he >> 10)) - 15 + 127;
+                    // Exponent
+                    unsigned int32 xe = (unsigned int32) (xes << 23);
+                    // Mantissa
+                    unsigned int32 xm = ((unsigned int32) hm) << 13;
+                    return floatbits(xs | xe | xm);
+                }
             }
         }
     }
 }
 
@@ -2939,209 +2949,237 @@ static inline float half_to_float(unsigned int16 h) {
 
 static inline uniform int16 float_to_half(uniform float f) {
-    uniform int32 x = intbits(f);
-    // Store the return value in an int32 until the very end; this ends up
-    // generating better code...
-    uniform int32 ret;
-    if ((x & 0x7FFFFFFFu) == 0)
-        // Signed zero
-        ret = (x >> 16);
+    if (__have_native_half) {
+        return __float_to_half_uniform(f);
+    }
     else {
-        uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-        uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-        uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
-        if (xe == 0) {
-            // Denormal will underflow, return a signed zero
-            ret = (xs >> 16);
-        }
+        uniform int32 x = intbits(f);
+        // Store the return value in an int32 until the very end; this ends up
+        // generating better code...
+        uniform int32 ret;
+        if ((x & 0x7FFFFFFFu) == 0)
+            // Signed zero
+            ret = (x >> 16);
         else {
-            if (xe == 0x7F800000u) {
-                // Inf or NaN (all the exponent bits are set)
-                if (xm == 0)
-                    // Zero mantissa -> signed infinity
-                    ret = ((xs >> 16) | 0x7C00u);
-                else
-                    // NaN, only 1st mantissa bit set
-                    ret = 0xFE00u;
-            }
-            else {
-                // Normalized number
-                uniform unsigned int32 hs = (xs >> 16); // Sign bit
-                uniform unsigned int32 hm;
-                // Exponent unbias the single, then bias the halfp
-                uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
-                if (hes >= 0x1F)
-                    // Overflow: return signed infinity
-                    ret = ((xs >> 16) | 0x7C00u);
-                else if (hes <= 0) {
-                    // Underflow
-                    if ((14 - hes) > 24) {
-                        // Mantissa shifted all the way off & no rounding possibility
-                        hm = 0u;  // Set mantissa to zero
+            uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+            uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+            uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+            if (xe == 0) {
+                // Denormal will underflow, return a signed zero
+                ret = (xs >> 16);
+            }
+            else {
+                if (xe == 0x7F800000u) {
+                    // Inf or NaN (all the exponent bits are set)
+                    if (xm == 0)
+                        // Zero mantissa -> signed infinity
+                        ret = ((xs >> 16) | 0x7C00u);
+                    else
+                        // NaN, only 1st mantissa bit set
+                        ret = 0xFE00u;
+                }
+                else {
+                    // Normalized number
+                    uniform unsigned int32 hs = (xs >> 16); // Sign bit
+                    uniform unsigned int32 hm;
+                    // Exponent unbias the single, then bias the halfp
+                    uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
+                    if (hes >= 0x1F)
+                        // Overflow: return signed infinity
+                        ret = ((xs >> 16) | 0x7C00u);
+                    else if (hes <= 0) {
+                        // Underflow
+                        if ((14 - hes) > 24) {
+                            // Mantissa shifted all the way off & no rounding possibility
+                            hm = 0u;  // Set mantissa to zero
+                        }
+                        else {
+                            xm |= 0x00800000u;  // Add the hidden leading bit
+                            hm = (xm >> (14 - hes)); // Mantissa
+                            if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
+                                // Round, might overflow into exp bit, but this is OK
+                                hm += 1u;
+                        }
+                        ret = (hs | hm);
                     }
                     else {
-                        xm |= 0x00800000u;  // Add the hidden leading bit
-                        hm = (xm >> (14 - hes)); // Mantissa
-                        if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
-                            // Round, might overflow into exp bit, but this is OK
-                            hm += 1u;
+                        uniform unsigned int32 he = (hes << 10); // Exponent
+                        hm = (xm >> 13); // Mantissa
+                        if (xm & 0x00001000u) // Check for rounding
+                            // Round, might overflow to inf, this is OK
+                            ret = (hs | he | hm) + 1u;
+                        else
+                            ret = (hs | he | hm);
                     }
-                    ret = (hs | hm);
-                }
-                else {
-                    uniform unsigned int32 he = (hes << 10); // Exponent
-                    hm = (xm >> 13); // Mantissa
-                    if (xm & 0x00001000u) // Check for rounding
-                        // Round, might overflow to inf, this is OK
-                        ret = (hs | he | hm) + 1u;
-                    else
-                        ret = (hs | he | hm);
-                }
                 }
             }
         }
+        return (int16)ret;
     }
-    return (int16)ret;
 }
 
 static inline int16 float_to_half(float f) {
-    int32 x = intbits(f);
-    // Store the return value in an int32 until the very end; this ends up
-    // generating better code...
-    int32 ret;
-    if ((x & 0x7FFFFFFFu) == 0)
-        // Signed zero
-        ret = (x >> 16);
+    if (__have_native_half) {
+        return __float_to_half_varying(f);
+    }
     else {
-        unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-        unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-        unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
-        if (xe == 0) {
-            // Denormal will underflow, return a signed zero
-            ret = (xs >> 16);
-        }
+        int32 x = intbits(f);
+        // Store the return value in an int32 until the very end; this ends up
+        // generating better code...
+        int32 ret;
+        if ((x & 0x7FFFFFFFu) == 0)
+            // Signed zero
+            ret = (x >> 16);
         else {
-            cif (xe == 0x7F800000u) {
-                // Inf or NaN (all the exponent bits are set)
-                if (xm == 0)
-                    // Zero mantissa -> signed infinity
-                    ret = ((xs >> 16) | 0x7C00u);
-                else
-                    // NaN, only 1st mantissa bit set
-                    ret = 0xFE00u;
-            }
-            else {
-                // Normalized number
-                unsigned int32 hs = (xs >> 16); // Sign bit
-                unsigned int32 hm;
-                // Exponent unbias the single, then bias the halfp
-                int32 hes = ((int)(xe >> 23)) - 127 + 15;
-                if (hes >= 0x1F)
-                    // Overflow: return signed infinity
-                    ret = ((xs >> 16) | 0x7C00u);
-                else if (hes <= 0) {
-                    // Underflow
-                    if ((14 - hes) > 24) {
-                        // Mantissa shifted all the way off & no rounding possibility
-                        hm = 0u;  // Set mantissa to zero
+            unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+            unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+            unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+            if (xe == 0) {
+                // Denormal will underflow, return a signed zero
+                ret = (xs >> 16);
+            }
+            else {
+                cif (xe == 0x7F800000u) {
+                    // Inf or NaN (all the exponent bits are set)
+                    if (xm == 0)
+                        // Zero mantissa -> signed infinity
+                        ret = ((xs >> 16) | 0x7C00u);
+                    else
+                        // NaN, only 1st mantissa bit set
+                        ret = 0xFE00u;
+                }
+                else {
+                    // Normalized number
+                    unsigned int32 hs = (xs >> 16); // Sign bit
+                    unsigned int32 hm;
+                    // Exponent unbias the single, then bias the halfp
+                    int32 hes = ((int)(xe >> 23)) - 127 + 15;
+                    if (hes >= 0x1F)
+                        // Overflow: return signed infinity
+                        ret = ((xs >> 16) | 0x7C00u);
+                    else if (hes <= 0) {
+                        // Underflow
+                        if ((14 - hes) > 24) {
+                            // Mantissa shifted all the way off & no rounding possibility
+                            hm = 0u;  // Set mantissa to zero
+                        }
+                        else {
+                            xm |= 0x00800000u;  // Add the hidden leading bit
+                            hm = (xm >> (14 - hes)); // Mantissa
+                            if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
+                                // Round, might overflow into exp bit, but this is OK
+                                hm += 1u;
+                        }
+                        ret = (hs | hm);
                     }
                     else {
-                        xm |= 0x00800000u;  // Add the hidden leading bit
-                        hm = (xm >> (14 - hes)); // Mantissa
-                        if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
-                            // Round, might overflow into exp bit, but this is OK
-                            hm += 1u;
+                        unsigned int32 he = (hes << 10); // Exponent
+                        hm = (xm >> 13); // Mantissa
+                        if (xm & 0x00001000u) // Check for rounding
+                            // Round, might overflow to inf, this is OK
+                            ret = (hs | he | hm) + 1u;
+                        else
+                            ret = (hs | he | hm);
                     }
-                    ret = (hs | hm);
-                }
-                else {
-                    unsigned int32 he = (hes << 10); // Exponent
-                    hm = (xm >> 13); // Mantissa
-                    if (xm & 0x00001000u) // Check for rounding
-                        // Round, might overflow to inf, this is OK
-                        ret = (hs | he | hm) + 1u;
-                    else
-                        ret = (hs | he | hm);
-                }
                 }
             }
         }
+        return (int16)ret;
     }
-    return (int16)ret;
 }
 
 static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
-    uniform unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
-    uniform unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
-    uniform unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
-
-    // sign
-    uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
-    // Exponent: unbias the halfp, then bias the single
-    uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
-    // Exponent
-    uniform unsigned int32 xe = (unsigned int32) (xes << 23);
-    // Mantissa
-    uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
-    return floatbits(xs | xe | xm);
+    if (__have_native_half) {
+        return __half_to_float_uniform(h);
+    }
+    else {
+        uniform unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
+        uniform unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
+        uniform unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
+        // sign
+        uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
+        // Exponent: unbias the halfp, then bias the single
+        uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
+        // Exponent
+        uniform unsigned int32 xe = (unsigned int32) (xes << 23);
+        // Mantissa
+        uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
+        return floatbits(xs | xe | xm);
+    }
 }
 
 static inline float half_to_float_fast(unsigned int16 h) {
-    unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
-    unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
-    unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
-
-    // sign
-    unsigned int32 xs = ((unsigned int32) hs) << 16;
-    // Exponent: unbias the halfp, then bias the single
-    int32 xes = ((int32) (he >> 10)) - 15 + 127;
-    // Exponent
-    unsigned int32 xe = (unsigned int32) (xes << 23);
-    // Mantissa
-    unsigned int32 xm = ((unsigned int32) hm) << 13;
-    return floatbits(xs | xe | xm);
+    if (__have_native_half) {
+        return __half_to_float_varying(h);
+    }
+    else {
+        unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
+        unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
+        unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
+        // sign
+        unsigned int32 xs = ((unsigned int32) hs) << 16;
+        // Exponent: unbias the halfp, then bias the single
+        int32 xes = ((int32) (he >> 10)) - 15 + 127;
+        // Exponent
+        unsigned int32 xe = (unsigned int32) (xes << 23);
+        // Mantissa
+        unsigned int32 xm = ((unsigned int32) hm) << 13;
+        return floatbits(xs | xe | xm);
+    }
 }
 
 static inline uniform int16 float_to_half_fast(uniform float f) {
-    uniform int32 x = intbits(f);
-    uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-    uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-    uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+    if (__have_native_half) {
+        return __float_to_half_uniform(f);
+    }
+    else {
+        uniform int32 x = intbits(f);
+        uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+        uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+        uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
 
-    uniform unsigned int32 hs = (xs >> 16); // Sign bit
-    // Exponent unbias the single, then bias the halfp
-    uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
-    uniform unsigned int32 he = (hes << 10); // Exponent
-    uniform int32 hm = (xm >> 13); // Mantissa
-    uniform int32 ret = (hs | he | hm);
+        uniform unsigned int32 hs = (xs >> 16); // Sign bit
+        // Exponent unbias the single, then bias the halfp
+        uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
+        uniform unsigned int32 he = (hes << 10); // Exponent
+        uniform int32 hm = (xm >> 13); // Mantissa
+        uniform int32 ret = (hs | he | hm);
 
-    if (xm & 0x00001000u) // Check for rounding
-        // Round, might overflow to inf, this is OK
-        ret += 1u;
+        if (xm & 0x00001000u) // Check for rounding
+            // Round, might overflow to inf, this is OK
+            ret += 1u;
 
-    return (int16)ret;
+        return (int16)ret;
+    }
 }
 
 static inline int16 float_to_half_fast(float f) {
-    int32 x = intbits(f);
-    unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-    unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-    unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+    if (__have_native_half) {
+        return __float_to_half_varying(f);
+    }
+    else {
+        int32 x = intbits(f);
+        unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+        unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+        unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
 
-    unsigned int32 hs = (xs >> 16); // Sign bit
-    // Exponent unbias the single, then bias the halfp
-    int32 hes = ((int)(xe >> 23)) - 127 + 15;
-    unsigned int32 he = (hes << 10); // Exponent
-    int32 hm = (xm >> 13); // Mantissa
-    int32 ret = (hs | he | hm);
+        unsigned int32 hs = (xs >> 16); // Sign bit
+        // Exponent unbias the single, then bias the halfp
+        int32 hes = ((int)(xe >> 23)) - 127 + 15;
+        unsigned int32 he = (hes << 10); // Exponent
+        int32 hm = (xm >> 13); // Mantissa
+        int32 ret = (hs | he | hm);
 
-    if (xm & 0x00001000u) // Check for rounding
-        // Round, might overflow to inf, this is OK
-        ret += 1u;
+        if (xm & 0x00001000u) // Check for rounding
+            // Round, might overflow to inf, this is OK
+            ret += 1u;
 
-    return (int16)ret;
+        return (int16)ret;
+    }
 }
 
 ///////////////////////////////////////////////////////////////////////////

diff --git a/tests/half-3.ispc b/tests/half-3.ispc
new file mode 100644
index 00000000..47de0eee
--- /dev/null
+++ b/tests/half-3.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+export void f_v(uniform float RET[]) {
+    int errors = 0;
+
+    foreach (i = 0 ... 65535) {
+        unsigned int16 h = i;
+        float f = half_to_float(i);
+        h = float_to_half(f);
+
+        int mismatches = (f == f && i != h);
+        errors += reduce_add(mismatches);
+    }
+
+    RET[programIndex] = errors;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
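
Editor's note (illustrative, not part of the patch): a quick sketch of how
the new code paths are reached from user code. With --target=avx2, the
half_to_float()/float_to_half() stdlib calls below now lower to the F16C
vcvtph2ps/vcvtps2ph instructions through the __half_to_float_* /
__float_to_half_* built-ins declared above; on the other targets they fall
back to the bit-manipulation implementations kept in stdlib.ispc. The file,
function, and parameter names in this sketch are invented for illustration.

// halfbuf.ispc (hypothetical)

// Pack a float array into IEEE half-precision storage.
export void compress_to_half(uniform float fin[],
                             uniform unsigned int16 hout[],
                             uniform int count) {
    foreach (i = 0 ... count) {
        // On the native path the conversion compiles to vcvtps2ph,
        // with the immediate 0 selecting round-to-nearest-even.
        hout[i] = float_to_half(fin[i]);
    }
}

// Expand half-precision storage back to floats.
export void expand_from_half(uniform unsigned int16 hin[],
                             uniform float fout[],
                             uniform int count) {
    foreach (i = 0 ... count) {
        fout[i] = half_to_float(hin[i]);
    }
}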