Use native float/half conversion instructions with the AVX2 target.

This commit is contained in:
Matt Pharr
2012-01-24 15:33:38 -08:00
parent a5b7fca7e0
commit 1867b5b317
12 changed files with 453 additions and 256 deletions

View File

@@ -822,6 +822,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module, lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
symbolTable); symbolTable);
lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
module, symbolTable);
if (includeStdlibISPC) { if (includeStdlibISPC) {
// If the user wants the standard library to be included, parse the // If the user wants the standard library to be included, parse the
// serialized version of the stdlib.ispc file to get its // serialized version of the stdlib.ispc file to get its

View File

@@ -58,6 +58,14 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
ret <16 x i32> %ret ret <16 x i32> %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather ;; gather

View File

@@ -58,6 +58,14 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
ret <8 x i32> %ret ret <8 x i32> %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather ;; gather
@@ -65,6 +73,3 @@ gen_gather(8, i8)
gen_gather(8, i16) gen_gather(8, i16)
gen_gather(8, i32) gen_gather(8, i32)
gen_gather(8, i64) gen_gather(8, i64)

View File

@@ -63,6 +63,61 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
ret <16 x i32> %m ret <16 x i32> %m
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
;; Convert 16 half-precision values to single precision using the F16C
;; VCVTPH2PS instruction, which operates on 8 elements at a time: split
;; the input into its low and high 8-element halves, convert each, and
;; concatenate the two results back into a 16-wide vector.
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
;; low 8 elements
%r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
;; high 8 elements
%r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
;; reassemble the converted halves
%r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %r
}
;; Convert 16 single-precision values to half precision using the F16C
;; VCVTPS2PH instruction (8 elements per call): split the input into
;; low/high 8-element halves, convert each with round-to-nearest-even
;; (immediate operand 0), and concatenate the results.
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
;; low 8 elements
%r_0 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
;; high 8 elements
%r_1 = shufflevector <16 x float> %v, <16 x float> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
;; reassemble the converted halves
%r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %r
}
;; Scalar half->float conversion via the 8-wide VCVTPH2PS instruction:
;; broadcast the single i16 into lane 0 of an 8-element vector (the
;; other lanes are undef), convert, and extract lane 0 of the result.
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
;; i16 -> <1 x i16> so it can be widened with shufflevector
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
;; only lane 0 holds a meaningful result
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
;; Scalar float->half conversion via the 8-wide VCVTPS2PH instruction:
;; place the single float in lane 0 of an 8-element vector (other lanes
;; undef), convert with round-to-nearest-even, and extract lane 0.
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
;; float -> <1 x float> so it can be widened with shufflevector
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
;; only lane 0 holds a meaningful result
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather ;; gather

View File

@@ -63,6 +63,44 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
ret <8 x i32> %m ret <8 x i32> %m
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
;; 8-wide half->float conversion: maps directly onto a single F16C
;; VCVTPH2PS instruction, since it natively handles 8 elements.
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
%r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
ret <8 x float> %r
}
;; 8-wide float->half conversion: a single F16C VCVTPS2PH instruction
;; with round-to-nearest-even (immediate operand 0).
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
%r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
ret <8 x i16> %r
}
;; Scalar half->float conversion via the 8-wide VCVTPH2PS instruction:
;; broadcast the single i16 into lane 0 of an 8-element vector (the
;; other lanes are undef), convert, and extract lane 0 of the result.
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
;; i16 -> <1 x i16> so it can be widened with shufflevector
%v1 = bitcast i16 %v to <1 x i16>
%vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
;; only lane 0 holds a meaningful result
%r = extractelement <8 x float> %rv, i32 0
ret float %r
}
;; Scalar float->half conversion via the 8-wide VCVTPS2PH instruction:
;; place the single float in lane 0 of an 8-element vector (other lanes
;; undef), convert with round-to-nearest-even, and extract lane 0.
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
;; float -> <1 x float> so it can be widened with shufflevector
%v1 = bitcast float %v to <1 x float>
%vv = shufflevector <1 x float> %v1, <1 x float> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
; round to nearest even
%rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
;; only lane 0 holds a meaningful result
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather ;; gather
@@ -70,6 +108,3 @@ gen_gather(8, i8)
gen_gather(8, i16) gen_gather(8, i16)
gen_gather(8, i32) gen_gather(8, i32)
gen_gather(8, i64) gen_gather(8, i64)

View File

@@ -47,6 +47,14 @@ int64minmax()
include(`target-sse2-common.ll') include(`target-sse2-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp ;; rcp

View File

@@ -44,6 +44,14 @@ int64minmax()
include(`target-sse2-common.ll') include(`target-sse2-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding ;; rounding
;; ;;

View File

@@ -47,6 +47,14 @@ int64minmax()
include(`target-sse4-common.ll') include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp ;; rcp

View File

@@ -44,6 +44,14 @@ int64minmax()
include(`target-sse4-common.ll') include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp ;; rcp

View File

@@ -210,7 +210,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->isa = Target::AVX2; t->isa = Target::AVX2;
t->nativeVectorWidth = 8; t->nativeVectorWidth = 8;
t->vectorWidth = 8; t->vectorWidth = 8;
t->attributes = "+avx2,+popcnt,+cmov"; t->attributes = "+avx2,+popcnt,+cmov,+f16c";
t->maskingIsFree = false; t->maskingIsFree = false;
t->allOffMaskIsSafe = false; t->allOffMaskIsSafe = false;
t->maskBitCount = 32; t->maskBitCount = 32;
@@ -219,7 +219,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->isa = Target::AVX2; t->isa = Target::AVX2;
t->nativeVectorWidth = 16; t->nativeVectorWidth = 16;
t->vectorWidth = 16; t->vectorWidth = 16;
t->attributes = "+avx2,+popcnt,+cmov"; t->attributes = "+avx2,+popcnt,+cmov,+f16c";
t->maskingIsFree = false; t->maskingIsFree = false;
t->allOffMaskIsSafe = false; t->allOffMaskIsSafe = false;
t->maskBitCount = 32; t->maskBitCount = 32;

View File

@@ -2824,114 +2824,124 @@ static inline uniform double pow(uniform double a, uniform double b) {
// half-precision floats // half-precision floats
static inline uniform float half_to_float(uniform unsigned int16 h) { static inline uniform float half_to_float(uniform unsigned int16 h) {
if ((h & 0x7FFFu) == 0) if (__have_native_half) {
// Signed zero return __half_to_float_uniform(h);
return floatbits(((unsigned int32) h) << 16); }
else { else {
// Though these are int16 quantities, we get much better code if ((h & 0x7FFFu) == 0)
// with them stored as int32s... // Signed zero
uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit return floatbits(((unsigned int32) h) << 16);
uniform unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
uniform unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
if (he == 0) {
// Denormal will convert to normalized
uniform int e = -1;
// The following loop figures out how much extra to adjust the exponent
// Shift until leading bit overflows into exponent bit
do {
e++;
hm <<= 1;
} while((hm & 0x0400u) == 0);
// Sign bit
uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
// Exponent: unbias the halfp, then bias the single
uniform int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
// Exponent
uniform unsigned int32 xe = (unsigned int32) (xes << 23);
// Mantissa
uniform unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
return floatbits(xs | xe | xm);
}
else { else {
if (he == 0x7C00u) { // Though these are int16 quantities, we get much better code
// Inf or NaN (all the exponent bits are set) // with them stored as int32s...
if (hm == 0) uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
// Zero mantissa -> signed inf uniform unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
return floatbits((((unsigned int32) hs) << 16) | uniform unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
((unsigned int32) 0x7F800000u)); if (he == 0) {
else // Denormal will convert to normalized
// NaN uniform int e = -1;
return floatbits(0xFFC00000u); // The following loop figures out how much extra to adjust the exponent
} // Shift until leading bit overflows into exponent bit
else { do {
// Normalized number e++;
// sign hm <<= 1;
} while((hm & 0x0400u) == 0);
// Sign bit
uniform unsigned int32 xs = ((unsigned int32) hs) << 16; uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
// Exponent: unbias the halfp, then bias the single // Exponent: unbias the halfp, then bias the single
uniform int32 xes = ((int32) (he >> 10)) - 15 + 127; uniform int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
// Exponent // Exponent
uniform unsigned int32 xe = (unsigned int32) (xes << 23); uniform unsigned int32 xe = (unsigned int32) (xes << 23);
// Mantissa // Mantissa
uniform unsigned int32 xm = ((unsigned int32) hm) << 13; uniform unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
return floatbits(xs | xe | xm); return floatbits(xs | xe | xm);
}
else {
if (he == 0x7C00u) {
// Inf or NaN (all the exponent bits are set)
if (hm == 0)
// Zero mantissa -> signed inf
return floatbits((((unsigned int32) hs) << 16) |
((unsigned int32) 0x7F800000u));
else
// NaN
return floatbits(0xFFC00000u);
}
else {
// Normalized number
// sign
uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
// Exponent: unbias the halfp, then bias the single
uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
// Exponent
uniform unsigned int32 xe = (unsigned int32) (xes << 23);
// Mantissa
uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
return floatbits(xs | xe | xm);
}
} }
} }
} }
} }
static inline float half_to_float(unsigned int16 h) { static inline float half_to_float(unsigned int16 h) {
if ((h & 0x7FFFu) == 0) if (__have_native_half) {
// Signed zero return __half_to_float_varying(h);
return floatbits(((unsigned int32) h) << 16); }
else { else {
// Though these are int16 quantities, we get much better code if ((h & 0x7FFFu) == 0)
// with them stored as int32s... // Signed zero
unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit return floatbits(((unsigned int32) h) << 16);
unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
cif (he == 0) {
// Denormal will convert to normalized
int e = -1;
// The following loop figures out how much extra to adjust the exponent
// Shift until leading bit overflows into exponent bit
do {
e++;
hm <<= 1;
} while((hm & 0x0400u) == 0);
// Sign bit
unsigned int32 xs = ((unsigned int32) hs) << 16;
// Exponent: unbias the halfp, then bias the single
int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
// Exponent
unsigned int32 xe = (unsigned int32) (xes << 23);
// Mantissa
unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
return floatbits(xs | xe | xm);
}
else { else {
if (he == 0x7C00u) { // Though these are int16 quantities, we get much better code
// Inf or NaN (all the exponent bits are set) // with them stored as int32s...
if (hm == 0) unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
// Zero mantissa -> signed inf unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
return floatbits((((unsigned int32) hs) << 16) | unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
((unsigned int32) 0x7F800000u)); cif (he == 0) {
else // Denormal will convert to normalized
// NaN int e = -1;
return floatbits(0xFFC00000u); // The following loop figures out how much extra to adjust the exponent
} // Shift until leading bit overflows into exponent bit
else { do {
// Normalized number e++;
// sign hm <<= 1;
} while((hm & 0x0400u) == 0);
// Sign bit
unsigned int32 xs = ((unsigned int32) hs) << 16; unsigned int32 xs = ((unsigned int32) hs) << 16;
// Exponent: unbias the halfp, then bias the single // Exponent: unbias the halfp, then bias the single
int32 xes = ((int32) (he >> 10)) - 15 + 127; int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
// Exponent // Exponent
unsigned int32 xe = (unsigned int32) (xes << 23); unsigned int32 xe = (unsigned int32) (xes << 23);
// Mantissa // Mantissa
unsigned int32 xm = ((unsigned int32) hm) << 13; unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
return floatbits(xs | xe | xm); return floatbits(xs | xe | xm);
}
else {
if (he == 0x7C00u) {
// Inf or NaN (all the exponent bits are set)
if (hm == 0)
// Zero mantissa -> signed inf
return floatbits((((unsigned int32) hs) << 16) |
((unsigned int32) 0x7F800000u));
else
// NaN
return floatbits(0xFFC00000u);
}
else {
// Normalized number
// sign
unsigned int32 xs = ((unsigned int32) hs) << 16;
// Exponent: unbias the halfp, then bias the single
int32 xes = ((int32) (he >> 10)) - 15 + 127;
// Exponent
unsigned int32 xe = (unsigned int32) (xes << 23);
// Mantissa
unsigned int32 xm = ((unsigned int32) hm) << 13;
return floatbits(xs | xe | xm);
}
} }
} }
} }
@@ -2939,209 +2949,237 @@ static inline float half_to_float(unsigned int16 h) {
static inline uniform int16 float_to_half(uniform float f) { static inline uniform int16 float_to_half(uniform float f) {
uniform int32 x = intbits(f); if (__have_native_half) {
// Store the return value in an int32 until the very end; this ends up return __float_to_half_uniform(f);
// generating better code... }
uniform int32 ret;
if ((x & 0x7FFFFFFFu) == 0)
// Signed zero
ret = (x >> 16);
else { else {
uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit uniform int32 x = intbits(f);
uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits // Store the return value in an int32 until the very end; this ends up
uniform unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits // generating better code...
if (xe == 0) { uniform int32 ret;
// Denormal will underflow, return a signed zero if ((x & 0x7FFFFFFFu) == 0)
ret = (xs >> 16); // Signed zero
} ret = (x >> 16);
else { else {
if (xe == 0x7F800000u) { uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
// Inf or NaN (all the exponent bits are set) uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
if (xm == 0) uniform unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
// Zero mantissa -> signed infinity if (xe == 0) {
ret = ((xs >> 16) | 0x7C00u); // Denormal will underflow, return a signed zero
else ret = (xs >> 16);
// NaN, only 1st mantissa bit set }
ret = 0xFE00u; else {
} if (xe == 0x7F800000u) {
else { // Inf or NaN (all the exponent bits are set)
// Normalized number if (xm == 0)
uniform unsigned int32 hs = (xs >> 16); // Sign bit // Zero mantissa -> signed infinity
uniform unsigned int32 hm; ret = ((xs >> 16) | 0x7C00u);
// Exponent unbias the single, then bias the halfp else
uniform int32 hes = ((int)(xe >> 23)) - 127 + 15; // NaN, only 1st mantissa bit set
if (hes >= 0x1F) ret = 0xFE00u;
// Overflow: return signed infinity }
ret = ((xs >> 16) | 0x7C00u); else {
else if (hes <= 0) { // Normalized number
// Underflow uniform unsigned int32 hs = (xs >> 16); // Sign bit
if ((14 - hes) > 24) { uniform unsigned int32 hm;
// Mantissa shifted all the way off & no rounding possibility // Exponent unbias the single, then bias the halfp
hm = 0u; // Set mantissa to zero uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
if (hes >= 0x1F)
// Overflow: return signed infinity
ret = ((xs >> 16) | 0x7C00u);
else if (hes <= 0) {
// Underflow
if ((14 - hes) > 24) {
// Mantissa shifted all the way off & no rounding possibility
hm = 0u; // Set mantissa to zero
}
else {
xm |= 0x00800000u; // Add the hidden leading bit
hm = (xm >> (14 - hes)); // Mantissa
if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
// Round, might overflow into exp bit, but this is OK
hm += 1u;
}
ret = (hs | hm);
} }
else { else {
xm |= 0x00800000u; // Add the hidden leading bit uniform unsigned int32 he = (hes << 10); // Exponent
hm = (xm >> (14 - hes)); // Mantissa hm = (xm >> 13); // Mantissa
if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding if (xm & 0x00001000u) // Check for rounding
// Round, might overflow into exp bit, but this is OK // Round, might overflow to inf, this is OK
hm += 1u; ret = (hs | he | hm) + 1u;
else
ret = (hs | he | hm);
} }
ret = (hs | hm);
}
else {
uniform unsigned int32 he = (hes << 10); // Exponent
hm = (xm >> 13); // Mantissa
if (xm & 0x00001000u) // Check for rounding
// Round, might overflow to inf, this is OK
ret = (hs | he | hm) + 1u;
else
ret = (hs | he | hm);
} }
} }
} }
return (int16)ret;
} }
return (int16)ret;
} }
static inline int16 float_to_half(float f) { static inline int16 float_to_half(float f) {
int32 x = intbits(f); if (__have_native_half) {
// Store the return value in an int32 until the very end; this ends up return __float_to_half_varying(f);
// generating better code... }
int32 ret;
if ((x & 0x7FFFFFFFu) == 0)
// Signed zero
ret = (x >> 16);
else { else {
unsigned int32 xs = x & 0x80000000u; // Pick off sign bit int32 x = intbits(f);
unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits // Store the return value in an int32 until the very end; this ends up
unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits // generating better code...
if (xe == 0) { int32 ret;
// Denormal will underflow, return a signed zero if ((x & 0x7FFFFFFFu) == 0)
ret = (xs >> 16); // Signed zero
} ret = (x >> 16);
else { else {
cif (xe == 0x7F800000u) { unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
// Inf or NaN (all the exponent bits are set) unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
if (xm == 0) unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
// Zero mantissa -> signed infinity if (xe == 0) {
ret = ((xs >> 16) | 0x7C00u); // Denormal will underflow, return a signed zero
else ret = (xs >> 16);
// NaN, only 1st mantissa bit set }
ret = 0xFE00u; else {
} cif (xe == 0x7F800000u) {
else { // Inf or NaN (all the exponent bits are set)
// Normalized number if (xm == 0)
unsigned int32 hs = (xs >> 16); // Sign bit // Zero mantissa -> signed infinity
unsigned int32 hm; ret = ((xs >> 16) | 0x7C00u);
// Exponent unbias the single, then bias the halfp else
int32 hes = ((int)(xe >> 23)) - 127 + 15; // NaN, only 1st mantissa bit set
if (hes >= 0x1F) ret = 0xFE00u;
// Overflow: return signed infinity }
ret = ((xs >> 16) | 0x7C00u); else {
else if (hes <= 0) { // Normalized number
// Underflow unsigned int32 hs = (xs >> 16); // Sign bit
if ((14 - hes) > 24) { unsigned int32 hm;
// Mantissa shifted all the way off & no rounding possibility // Exponent unbias the single, then bias the halfp
hm = 0u; // Set mantissa to zero int32 hes = ((int)(xe >> 23)) - 127 + 15;
if (hes >= 0x1F)
// Overflow: return signed infinity
ret = ((xs >> 16) | 0x7C00u);
else if (hes <= 0) {
// Underflow
if ((14 - hes) > 24) {
// Mantissa shifted all the way off & no rounding possibility
hm = 0u; // Set mantissa to zero
}
else {
xm |= 0x00800000u; // Add the hidden leading bit
hm = (xm >> (14 - hes)); // Mantissa
if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
// Round, might overflow into exp bit, but this is OK
hm += 1u;
}
ret = (hs | hm);
} }
else { else {
xm |= 0x00800000u; // Add the hidden leading bit unsigned int32 he = (hes << 10); // Exponent
hm = (xm >> (14 - hes)); // Mantissa hm = (xm >> 13); // Mantissa
if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding if (xm & 0x00001000u) // Check for rounding
// Round, might overflow into exp bit, but this is OK // Round, might overflow to inf, this is OK
hm += 1u; ret = (hs | he | hm) + 1u;
else
ret = (hs | he | hm);
} }
ret = (hs | hm);
}
else {
unsigned int32 he = (hes << 10); // Exponent
hm = (xm >> 13); // Mantissa
if (xm & 0x00001000u) // Check for rounding
// Round, might overflow to inf, this is OK
ret = (hs | he | hm) + 1u;
else
ret = (hs | he | hm);
} }
} }
} }
return (int16)ret;
} }
return (int16)ret;
} }
static inline uniform float half_to_float_fast(uniform unsigned int16 h) { static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit if (__have_native_half) {
uniform unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits return __half_to_float_uniform(h);
uniform unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits }
else {
// sign uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
uniform unsigned int32 xs = ((unsigned int32) hs) << 16; uniform unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
// Exponent: unbias the halfp, then bias the single uniform unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
// Exponent
uniform unsigned int32 xe = (unsigned int32) (xes << 23);
// Mantissa
uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
return floatbits(xs | xe | xm);
// sign
uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
// Exponent: unbias the halfp, then bias the single
uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
// Exponent
uniform unsigned int32 xe = (unsigned int32) (xes << 23);
// Mantissa
uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
return floatbits(xs | xe | xm);
}
} }
static inline float half_to_float_fast(unsigned int16 h) { static inline float half_to_float_fast(unsigned int16 h) {
unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit if (__have_native_half) {
unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits return __half_to_float_varying(h);
unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits }
else {
// sign unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
unsigned int32 xs = ((unsigned int32) hs) << 16; unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
// Exponent: unbias the halfp, then bias the single unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
int32 xes = ((int32) (he >> 10)) - 15 + 127;
// Exponent
unsigned int32 xe = (unsigned int32) (xes << 23);
// Mantissa
unsigned int32 xm = ((unsigned int32) hm) << 13;
return floatbits(xs | xe | xm);
// sign
unsigned int32 xs = ((unsigned int32) hs) << 16;
// Exponent: unbias the halfp, then bias the single
int32 xes = ((int32) (he >> 10)) - 15 + 127;
// Exponent
unsigned int32 xe = (unsigned int32) (xes << 23);
// Mantissa
unsigned int32 xm = ((unsigned int32) hm) << 13;
return floatbits(xs | xe | xm);
}
} }
static inline uniform int16 float_to_half_fast(uniform float f) { static inline uniform int16 float_to_half_fast(uniform float f) {
uniform int32 x = intbits(f); if (__have_native_half) {
uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit return __float_to_half_uniform(f);
uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits }
uniform unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits else {
uniform int32 x = intbits(f);
uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
uniform unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
uniform unsigned int32 hs = (xs >> 16); // Sign bit uniform unsigned int32 hs = (xs >> 16); // Sign bit
// Exponent unbias the single, then bias the halfp // Exponent unbias the single, then bias the halfp
uniform int32 hes = ((int)(xe >> 23)) - 127 + 15; uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
uniform unsigned int32 he = (hes << 10); // Exponent uniform unsigned int32 he = (hes << 10); // Exponent
uniform int32 hm = (xm >> 13); // Mantissa uniform int32 hm = (xm >> 13); // Mantissa
uniform int32 ret = (hs | he | hm); uniform int32 ret = (hs | he | hm);
if (xm & 0x00001000u) // Check for rounding if (xm & 0x00001000u) // Check for rounding
// Round, might overflow to inf, this is OK // Round, might overflow to inf, this is OK
ret += 1u; ret += 1u;
return (int16)ret; return (int16)ret;
}
} }
static inline int16 float_to_half_fast(float f) { static inline int16 float_to_half_fast(float f) {
int32 x = intbits(f); if (__have_native_half) {
unsigned int32 xs = x & 0x80000000u; // Pick off sign bit return __float_to_half_varying(f);
unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits }
unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits else {
int32 x = intbits(f);
unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
unsigned int32 xm = x & 0x007FFFFFu; // Pick off mantissa bits
unsigned int32 hs = (xs >> 16); // Sign bit unsigned int32 hs = (xs >> 16); // Sign bit
// Exponent unbias the single, then bias the halfp // Exponent unbias the single, then bias the halfp
int32 hes = ((int)(xe >> 23)) - 127 + 15; int32 hes = ((int)(xe >> 23)) - 127 + 15;
unsigned int32 he = (hes << 10); // Exponent unsigned int32 he = (hes << 10); // Exponent
int32 hm = (xm >> 13); // Mantissa int32 hm = (xm >> 13); // Mantissa
int32 ret = (hs | he | hm); int32 ret = (hs | he | hm);
if (xm & 0x00001000u) // Check for rounding if (xm & 0x00001000u) // Check for rounding
// Round, might overflow to inf, this is OK // Round, might overflow to inf, this is OK
ret += 1u; ret += 1u;
return (int16)ret; return (int16)ret;
}
} }
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////

21
tests/half-3.ispc Normal file
View File

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
// Round-trip every 16-bit pattern through half_to_float/float_to_half and
// count the patterns that do not survive.  Patterns whose half->float
// conversion yields NaN are excluded via the (f == f) check, since
// float_to_half canonicalizes NaN payloads and the bits legitimately change.
// On success RET[] is all zeros.
export void f_v(uniform float RET[]) {
    int errors = 0;
    foreach (i = 0 ... 65535) {
        // Explicitly convert the loop index to the half bit pattern under
        // test, and keep the round-trip result in a separate variable so
        // the comparison below is unambiguous.
        unsigned int16 h = i;
        float f = half_to_float(h);
        unsigned int16 h2 = float_to_half(f);
        int mismatches = (f == f && h != h2);
        errors += reduce_add(mismatches);
    }
    RET[programIndex] = errors;
}
// Expected result for f_v: zero mismatches in every lane.
export void result(uniform float RET[]) {
    RET[programIndex] = 0;
}