From 1867b5b317a099d02bdf39cc3ff669affbe83e87 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Tue, 24 Jan 2012 15:33:38 -0800
Subject: [PATCH] Use native float/half conversion instructions with the AVX2
 target.

---
 builtins.cpp               |    3 +
 builtins/target-avx1-x2.ll |    8 +
 builtins/target-avx1.ll    |   11 +-
 builtins/target-avx2-x2.ll |   55 ++++
 builtins/target-avx2.ll    |   41 ++-
 builtins/target-sse2-x2.ll |    8 +
 builtins/target-sse2.ll    |    8 +
 builtins/target-sse4-x2.ll |    8 +
 builtins/target-sse4.ll    |    8 +
 ispc.cpp                   |    4 +-
 stdlib.ispc                |  534 ++++++++++++++++++++-----------------
 tests/half-3.ispc          |   21 ++
 12 files changed, 453 insertions(+), 256 deletions(-)
 create mode 100644 tests/half-3.ispc

diff --git a/builtins.cpp b/builtins.cpp
index 2608e031..76ebdfa7 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -822,6 +822,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload,
                            module, symbolTable);
 
+    lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
+                       module, symbolTable);
+
     if (includeStdlibISPC) {
         // If the user wants the standard library to be included, parse the
         // serialized version of the stdlib.ispc file to get its

diff --git a/builtins/target-avx1-x2.ll b/builtins/target-avx1-x2.ll
index d05da95f..36f47cec 100644
--- a/builtins/target-avx1-x2.ll
+++ b/builtins/target-avx1-x2.ll
@@ -58,6 +58,14 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
   ret <16 x i32> %ret
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather

diff --git a/builtins/target-avx1.ll b/builtins/target-avx1.ll
index 137ddf00..e46fc3b4 100644
--- a/builtins/target-avx1.ll
+++ b/builtins/target-avx1.ll
@@ -58,6 +58,14 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
   ret <8 x i32> %ret
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
 
@@ -65,6 +73,3 @@ gen_gather(8, i8)
 gen_gather(8, i16)
 gen_gather(8, i32)
 gen_gather(8, i64)
-
-
-

diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll
index fa4f345f..e4d3f686 100644
--- a/builtins/target-avx2-x2.ll
+++ b/builtins/target-avx2-x2.ll
@@ -63,6 +63,61 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
   ret <16 x i32> %m
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; 0 is round nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
+  %r_0 = shufflevector <16 x i16> %v, <16 x i16> undef,
+             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %vr_0 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_0)
+  %r_1 = shufflevector <16 x i16> %v, <16 x i16> undef,
+             <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vr_1 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %r_1)
+  %r = shufflevector <8 x float> %vr_0, <8 x float> %vr_1,
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x float> %r
+}
+
+define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
+  %r_0 = shufflevector <16 x float> %v, <16 x float> undef,
+             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %vr_0 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_0, i32 0)
+  %r_1 = shufflevector <16 x float> %v, <16 x float> undef,
+             <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vr_1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %r_1, i32 0)
+  %r = shufflevector <8 x i16> %vr_0, <8 x i16> %vr_1,
+           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                       i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %r
+}
+
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %v1 = bitcast i16 %v to <1 x i16>
+  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
+  %r = extractelement <8 x float> %rv, i32 0
+  ret float %r
+}
+
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %v1 = bitcast float %v to <1 x float>
+  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  ; round to nearest even
+  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
+  %r = extractelement <8 x i16> %rv, i32 0
+  ret i16 %r
+}
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather

diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll
index c812ede1..66b2a23e 100644
--- a/builtins/target-avx2.ll
+++ b/builtins/target-avx2.ll
@@ -63,6 +63,44 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
   ret <8 x i32> %m
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float/half conversions
+
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
+; 0 is round nearest even
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
+
+define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
+  %r = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
+  ret <8 x float> %r
+}
+
+define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
+  %r = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
+  ret <8 x i16> %r
+}
+
+define float @__half_to_float_uniform(i16 %v) nounwind readnone {
+  %v1 = bitcast i16 %v to <1 x i16>
+  %vv = shufflevector <1 x i16> %v1, <1 x i16> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  %rv = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %vv)
+  %r = extractelement <8 x float> %rv, i32 0
+  ret float %r
+}
+
+define i16 @__float_to_half_uniform(float %v) nounwind readnone {
+  %v1 = bitcast float %v to <1 x float>
+  %vv = shufflevector <1 x float> %v1, <1 x float> undef,
+           <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                      i32 undef, i32 undef, i32 undef, i32 undef>
+  ; round to nearest even
+  %rv = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %vv, i32 0)
+  %r = extractelement <8 x i16> %rv, i32 0
+  ret i16 %r
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
 
@@ -70,6 +108,3 @@ gen_gather(8, i8)
 gen_gather(8, i16)
 gen_gather(8, i32)
 gen_gather(8, i64)
-
-
-

diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll
index c0030f31..2e6d1bdc 100644
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -47,6 +47,14 @@ int64minmax()
 
 include(`target-sse2-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll
index 8d9911d8..21ffb267 100644
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -44,6 +44,14 @@ int64minmax()
 
 include(`target-sse2-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding
 ;;

diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll
index b7cd36ec..5a467ec2 100644
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -47,6 +47,14 @@ int64minmax()
 
 include(`target-sse4-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll
index 68ff49d9..9dfe9db7 100644
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -44,6 +44,14 @@ int64minmax()
 
 include(`target-sse4-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp

diff --git a/ispc.cpp b/ispc.cpp
index 523927fc..7fbc5bc6 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -210,7 +210,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->isa = Target::AVX2;
         t->nativeVectorWidth = 8;
         t->vectorWidth = 8;
-        t->attributes = "+avx2,+popcnt,+cmov";
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
         t->maskingIsFree = false;
         t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;
@@ -219,7 +219,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->isa = Target::AVX2;
         t->nativeVectorWidth = 16;
         t->vectorWidth = 16;
-        t->attributes = "+avx2,+popcnt,+cmov";
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
         t->maskingIsFree = false;
         t->allOffMaskIsSafe = false;
         t->maskBitCount = 32;

diff --git a/stdlib.ispc b/stdlib.ispc
index 8a7daf49..6cb7e732 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -2824,114 +2824,124 @@ static inline uniform double pow(uniform double a, uniform double b) {
 // half-precision floats
 
 static inline uniform float half_to_float(uniform unsigned int16 h) {
-    if ((h & 0x7FFFu) == 0)
-        // Signed zero
-        return floatbits(((unsigned int32) h) << 16);
+    if (__have_native_half) {
+        return __half_to_float_uniform(h);
+    }
     else {
-        // Though these are int16 quantities, we get much better code
-        // with them stored as int32s...
-        uniform unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
-        uniform unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
-        uniform unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
-        if (he == 0) {
-            // Denormal will convert to normalized
-            uniform int e = -1;
-            // The following loop figures out how much extra to adjust the exponent
-            // Shift until leading bit overflows into exponent bit
-            do {
-                e++;
-                hm <<= 1;
-            } while((hm & 0x0400u) == 0);
-
-            // Sign bit
-            uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
-            // Exponent: unbias the halfp, then bias the single
-            uniform int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
-            // Exponent
-            uniform unsigned int32 xe = (unsigned int32) (xes << 23);
-            // Mantissa
-            uniform unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
-            return floatbits(xs | xe | xm);
-        }
+        if ((h & 0x7FFFu) == 0)
+            // Signed zero
+            return floatbits(((unsigned int32) h) << 16);
         else {
-            if (he == 0x7C00u) {
-                // Inf or NaN (all the exponent bits are set)
-                if (hm == 0)
-                    // Zero mantissa -> signed inf
-                    return floatbits((((unsigned int32) hs) << 16) |
-                                     ((unsigned int32) 0x7F800000u));
-                else
-                    // NaN
-                    return floatbits(0xFFC00000u);
-            }
-            else {
-                // Normalized number
-                // sign
+            // Though these are int16 quantities, we get much better code
+            // with them stored as int32s...
+            uniform unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
+            uniform unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
+            uniform unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
+            if (he == 0) {
+                // Denormal will convert to normalized
+                uniform int e = -1;
+                // The following loop figures out how much extra to adjust the exponent
+                // Shift until leading bit overflows into exponent bit
+                do {
+                    e++;
+                    hm <<= 1;
+                } while((hm & 0x0400u) == 0);
+
+                // Sign bit
                 uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
                 // Exponent: unbias the halfp, then bias the single
-                uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
+                uniform int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
                 // Exponent
-                uniform unsigned int32 xe = (unsigned int32) (xes << 23);
+                uniform unsigned int32 xe = (unsigned int32) (xes << 23);
                 // Mantissa
-                uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
+                uniform unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
                 return floatbits(xs | xe | xm);
+            }
+            else {
+                if (he == 0x7C00u) {
+                    // Inf or NaN (all the exponent bits are set)
+                    if (hm == 0)
+                        // Zero mantissa -> signed inf
+                        return floatbits((((unsigned int32) hs) << 16) |
+                                         ((unsigned int32) 0x7F800000u));
+                    else
+                        // NaN
+                        return floatbits(0xFFC00000u);
+                }
+                else {
+                    // Normalized number
+                    // sign
+                    uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
+                    // Exponent: unbias the halfp, then bias the single
+                    uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
+                    // Exponent
+                    uniform unsigned int32 xe = (unsigned int32) (xes << 23);
+                    // Mantissa
+                    uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
+                    return floatbits(xs | xe | xm);
+                }
             }
         }
     }
 }
 
 static inline float half_to_float(unsigned int16 h) {
-    if ((h & 0x7FFFu) == 0)
-        // Signed zero
-        return floatbits(((unsigned int32) h) << 16);
+    if (__have_native_half) {
+        return __half_to_float_varying(h);
+    }
     else {
-        // Though these are int16 quantities, we get much better code
-        // with them stored as int32s...
-        unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
-        unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
-        unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
-        cif (he == 0) {
-            // Denormal will convert to normalized
-            int e = -1;
-            // The following loop figures out how much extra to adjust the exponent
-            // Shift until leading bit overflows into exponent bit
-            do {
-                e++;
-                hm <<= 1;
-            } while((hm & 0x0400u) == 0);
-
-            // Sign bit
-            unsigned int32 xs = ((unsigned int32) hs) << 16;
-            // Exponent: unbias the halfp, then bias the single
-            int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
-            // Exponent
-            unsigned int32 xe = (unsigned int32) (xes << 23);
-            // Mantissa
-            unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
-            return floatbits(xs | xe | xm);
-        }
+        if ((h & 0x7FFFu) == 0)
+            // Signed zero
+            return floatbits(((unsigned int32) h) << 16);
         else {
-            if (he == 0x7C00u) {
-                // Inf or NaN (all the exponent bits are set)
-                if (hm == 0)
-                    // Zero mantissa -> signed inf
-                    return floatbits((((unsigned int32) hs) << 16) |
-                                     ((unsigned int32) 0x7F800000u));
-                else
-                    // NaN
-                    return floatbits(0xFFC00000u);
-            }
-            else {
-                // Normalized number
-                // sign
+            // Though these are int16 quantities, we get much better code
+            // with them stored as int32s...
+            unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
+            unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
+            unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
+            cif (he == 0) {
+                // Denormal will convert to normalized
+                int e = -1;
+                // The following loop figures out how much extra to adjust the exponent
+                // Shift until leading bit overflows into exponent bit
+                do {
+                    e++;
+                    hm <<= 1;
+                } while((hm & 0x0400u) == 0);
+
+                // Sign bit
                 unsigned int32 xs = ((unsigned int32) hs) << 16;
                 // Exponent: unbias the halfp, then bias the single
-                int32 xes = ((int32) (he >> 10)) - 15 + 127;
+                int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
                 // Exponent
-                unsigned int32 xe = (unsigned int32) (xes << 23);
+                unsigned int32 xe = (unsigned int32) (xes << 23);
                 // Mantissa
-                unsigned int32 xm = ((unsigned int32) hm) << 13;
+                unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
                 return floatbits(xs | xe | xm);
+            }
+            else {
+                if (he == 0x7C00u) {
+                    // Inf or NaN (all the exponent bits are set)
+                    if (hm == 0)
+                        // Zero mantissa -> signed inf
+                        return floatbits((((unsigned int32) hs) << 16) |
+                                         ((unsigned int32) 0x7F800000u));
+                    else
+                        // NaN
+                        return floatbits(0xFFC00000u);
+                }
+                else {
+                    // Normalized number
+                    // sign
+                    unsigned int32 xs = ((unsigned int32) hs) << 16;
+                    // Exponent: unbias the halfp, then bias the single
+                    int32 xes = ((int32) (he >> 10)) - 15 + 127;
+                    // Exponent
+                    unsigned int32 xe = (unsigned int32) (xes << 23);
+                    // Mantissa
+                    unsigned int32 xm = ((unsigned int32) hm) << 13;
+                    return floatbits(xs | xe | xm);
+                }
             }
         }
     }
 }
 
@@ -2939,209 +2949,237 @@ static inline float half_to_float(unsigned int16 h) {
 
 static inline uniform int16 float_to_half(uniform float f) {
-    uniform int32 x = intbits(f);
-    // Store the return value in an int32 until the very end; this ends up
-    // generating better code...
-    uniform int32 ret;
-    if ((x & 0x7FFFFFFFu) == 0)
-        // Signed zero
-        ret = (x >> 16);
+    if (__have_native_half) {
+        return __float_to_half_uniform(f);
+    }
     else {
-        uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-        uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-        uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
-        if (xe == 0) {
-            // Denormal will underflow, return a signed zero
-            ret = (xs >> 16);
-        }
+        uniform int32 x = intbits(f);
+        // Store the return value in an int32 until the very end; this ends up
+        // generating better code...
+        uniform int32 ret;
+        if ((x & 0x7FFFFFFFu) == 0)
+            // Signed zero
+            ret = (x >> 16);
         else {
-            if (xe == 0x7F800000u) {
-                // Inf or NaN (all the exponent bits are set)
-                if (xm == 0)
-                    // Zero mantissa -> signed infinity
-                    ret = ((xs >> 16) | 0x7C00u);
-                else
-                    // NaN, only 1st mantissa bit set
-                    ret = 0xFE00u;
-            }
-            else {
-                // Normalized number
-                uniform unsigned int32 hs = (xs >> 16); // Sign bit
-                uniform unsigned int32 hm;
-                // Exponent unbias the single, then bias the halfp
-                uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
-                if (hes >= 0x1F)
-                    // Overflow: return signed infinity
-                    ret = ((xs >> 16) | 0x7C00u);
-                else if (hes <= 0) {
-                    // Underflow
-                    if ((14 - hes) > 24) {
-                        // Mantissa shifted all the way off & no rounding possibility
-                        hm = 0u;  // Set mantissa to zero
+            uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+            uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+            uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+            if (xe == 0) {
+                // Denormal will underflow, return a signed zero
+                ret = (xs >> 16);
+            }
+            else {
+                if (xe == 0x7F800000u) {
+                    // Inf or NaN (all the exponent bits are set)
+                    if (xm == 0)
+                        // Zero mantissa -> signed infinity
+                        ret = ((xs >> 16) | 0x7C00u);
+                    else
+                        // NaN, only 1st mantissa bit set
+                        ret = 0xFE00u;
+                }
+                else {
+                    // Normalized number
+                    uniform unsigned int32 hs = (xs >> 16); // Sign bit
+                    uniform unsigned int32 hm;
+                    // Exponent unbias the single, then bias the halfp
+                    uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
+                    if (hes >= 0x1F)
+                        // Overflow: return signed infinity
+                        ret = ((xs >> 16) | 0x7C00u);
+                    else if (hes <= 0) {
+                        // Underflow
+                        if ((14 - hes) > 24) {
+                            // Mantissa shifted all the way off & no rounding possibility
+                            hm = 0u;  // Set mantissa to zero
+                        }
+                        else {
+                            xm |= 0x00800000u;  // Add the hidden leading bit
+                            hm = (xm >> (14 - hes)); // Mantissa
+                            if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
+                                // Round, might overflow into exp bit, but this is OK
+                                hm += 1u;
+                        }
+                        ret = (hs | hm);
                     }
                     else {
-                        xm |= 0x00800000u;  // Add the hidden leading bit
-                        hm = (xm >> (14 - hes)); // Mantissa
-                        if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
-                            // Round, might overflow into exp bit, but this is OK
-                            hm += 1u;
+                        uniform unsigned int32 he = (hes << 10); // Exponent
+                        hm = (xm >> 13); // Mantissa
+                        if (xm & 0x00001000u) // Check for rounding
+                            // Round, might overflow to inf, this is OK
+                            ret = (hs | he | hm) + 1u;
+                        else
+                            ret = (hs | he | hm);
                     }
-                    ret = (hs | hm);
-                }
-                else {
-                    uniform unsigned int32 he = (hes << 10); // Exponent
-                    hm = (xm >> 13); // Mantissa
-                    if (xm & 0x00001000u) // Check for rounding
-                        // Round, might overflow to inf, this is OK
-                        ret = (hs | he | hm) + 1u;
-                    else
-                        ret = (hs | he | hm);
-                }
                 }
             }
         }
+        return (int16)ret;
     }
-    return (int16)ret;
 }
 
 static inline int16 float_to_half(float f) {
-    int32 x = intbits(f);
-    // Store the return value in an int32 until the very end; this ends up
-    // generating better code...
-    int32 ret;
-    if ((x & 0x7FFFFFFFu) == 0)
-        // Signed zero
-        ret = (x >> 16);
+    if (__have_native_half) {
+        return __float_to_half_varying(f);
+    }
     else {
-        unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-        unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-        unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
-        if (xe == 0) {
-            // Denormal will underflow, return a signed zero
-            ret = (xs >> 16);
-        }
+        int32 x = intbits(f);
+        // Store the return value in an int32 until the very end; this ends up
+        // generating better code...
+        int32 ret;
+        if ((x & 0x7FFFFFFFu) == 0)
+            // Signed zero
+            ret = (x >> 16);
         else {
-            cif (xe == 0x7F800000u) {
-                // Inf or NaN (all the exponent bits are set)
-                if (xm == 0)
-                    // Zero mantissa -> signed infinity
-                    ret = ((xs >> 16) | 0x7C00u);
-                else
-                    // NaN, only 1st mantissa bit set
-                    ret = 0xFE00u;
-            }
-            else {
-                // Normalized number
-                unsigned int32 hs = (xs >> 16); // Sign bit
-                unsigned int32 hm;
-                // Exponent unbias the single, then bias the halfp
-                int32 hes = ((int)(xe >> 23)) - 127 + 15;
-                if (hes >= 0x1F)
-                    // Overflow: return signed infinity
-                    ret = ((xs >> 16) | 0x7C00u);
-                else if (hes <= 0) {
-                    // Underflow
-                    if ((14 - hes) > 24) {
-                        // Mantissa shifted all the way off & no rounding possibility
-                        hm = 0u;  // Set mantissa to zero
+            unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+            unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+            unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+            if (xe == 0) {
+                // Denormal will underflow, return a signed zero
+                ret = (xs >> 16);
+            }
+            else {
+                cif (xe == 0x7F800000u) {
+                    // Inf or NaN (all the exponent bits are set)
+                    if (xm == 0)
+                        // Zero mantissa -> signed infinity
+                        ret = ((xs >> 16) | 0x7C00u);
+                    else
+                        // NaN, only 1st mantissa bit set
+                        ret = 0xFE00u;
+                }
+                else {
+                    // Normalized number
+                    unsigned int32 hs = (xs >> 16); // Sign bit
+                    unsigned int32 hm;
+                    // Exponent unbias the single, then bias the halfp
+                    int32 hes = ((int)(xe >> 23)) - 127 + 15;
+                    if (hes >= 0x1F)
+                        // Overflow: return signed infinity
+                        ret = ((xs >> 16) | 0x7C00u);
+                    else if (hes <= 0) {
+                        // Underflow
+                        if ((14 - hes) > 24) {
+                            // Mantissa shifted all the way off & no rounding possibility
+                            hm = 0u;  // Set mantissa to zero
+                        }
+                        else {
+                            xm |= 0x00800000u;  // Add the hidden leading bit
+                            hm = (xm >> (14 - hes)); // Mantissa
+                            if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
+                                // Round, might overflow into exp bit, but this is OK
+                                hm += 1u;
+                        }
+                        ret = (hs | hm);
                     }
                     else {
-                        xm |= 0x00800000u;  // Add the hidden leading bit
-                        hm = (xm >> (14 - hes)); // Mantissa
-                        if ((xm >> (13 - hes)) & 0x00000001u) // Check for rounding
-                            // Round, might overflow into exp bit, but this is OK
-                            hm += 1u;
+                        unsigned int32 he = (hes << 10); // Exponent
+                        hm = (xm >> 13); // Mantissa
+                        if (xm & 0x00001000u) // Check for rounding
+                            // Round, might overflow to inf, this is OK
+                            ret = (hs | he | hm) + 1u;
+                        else
+                            ret = (hs | he | hm);
                     }
-                    ret = (hs | hm);
-                }
-                else {
-                    unsigned int32 he = (hes << 10); // Exponent
-                    hm = (xm >> 13); // Mantissa
-                    if (xm & 0x00001000u) // Check for rounding
-                        // Round, might overflow to inf, this is OK
-                        ret = (hs | he | hm) + 1u;
-                    else
-                        ret = (hs | he | hm);
-                }
                 }
             }
         }
+        return (int16)ret;
     }
-    return (int16)ret;
 }
 
 static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
-    uniform unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
-    uniform unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
-    uniform unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
-
-    // sign
-    uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
-    // Exponent: unbias the halfp, then bias the single
-    uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
-    // Exponent
-    uniform unsigned int32 xe = (unsigned int32) (xes << 23);
-    // Mantissa
-    uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
-    return floatbits(xs | xe | xm);
+    if (__have_native_half) {
+        return __half_to_float_uniform(h);
+    }
+    else {
+        uniform unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
+        uniform unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
+        uniform unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
+        // sign
+        uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
+        // Exponent: unbias the halfp, then bias the single
+        uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
+        // Exponent
+        uniform unsigned int32 xe = (unsigned int32) (xes << 23);
+        // Mantissa
+        uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
+        return floatbits(xs | xe | xm);
+    }
 }
 
 static inline float half_to_float_fast(unsigned int16 h) {
-    unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
-    unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
-    unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
-
-    // sign
-    unsigned int32 xs = ((unsigned int32) hs) << 16;
-    // Exponent: unbias the halfp, then bias the single
-    int32 xes = ((int32) (he >> 10)) - 15 + 127;
-    // Exponent
-    unsigned int32 xe = (unsigned int32) (xes << 23);
-    // Mantissa
-    unsigned int32 xm = ((unsigned int32) hm) << 13;
-    return floatbits(xs | xe | xm);
+    if (__have_native_half) {
+        return __half_to_float_varying(h);
+    }
+    else {
+        unsigned int32 hs = h & (int32)0x8000u;   // Pick off sign bit
+        unsigned int32 he = h & (int32)0x7C00u;   // Pick off exponent bits
+        unsigned int32 hm = h & (int32)0x03FFu;   // Pick off mantissa bits
+        // sign
+        unsigned int32 xs = ((unsigned int32) hs) << 16;
+        // Exponent: unbias the halfp, then bias the single
+        int32 xes = ((int32) (he >> 10)) - 15 + 127;
+        // Exponent
+        unsigned int32 xe = (unsigned int32) (xes << 23);
+        // Mantissa
+        unsigned int32 xm = ((unsigned int32) hm) << 13;
+        return floatbits(xs | xe | xm);
+    }
 }
 
 static inline uniform int16 float_to_half_fast(uniform float f) {
-    uniform int32 x = intbits(f);
-    uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-    uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-    uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+    if (__have_native_half) {
+        return __float_to_half_uniform(f);
+    }
+    else {
+        uniform int32 x = intbits(f);
+        uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+        uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+        uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
 
-    uniform unsigned int32 hs = (xs >> 16); // Sign bit
-    // Exponent unbias the single, then bias the halfp
-    uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
-    uniform unsigned int32 he = (hes << 10); // Exponent
-    uniform int32 hm = (xm >> 13); // Mantissa
-    uniform int32 ret = (hs | he | hm);
+        uniform unsigned int32 hs = (xs >> 16); // Sign bit
+        // Exponent unbias the single, then bias the halfp
+        uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
+        uniform unsigned int32 he = (hes << 10); // Exponent
+        uniform int32 hm = (xm >> 13); // Mantissa
+        uniform int32 ret = (hs | he | hm);
 
-    if (xm & 0x00001000u) // Check for rounding
-        // Round, might overflow to inf, this is OK
-        ret += 1u;
+        if (xm & 0x00001000u) // Check for rounding
+            // Round, might overflow to inf, this is OK
+            ret += 1u;
 
-    return (int16)ret;
+        return (int16)ret;
+    }
 }
 
 static inline int16 float_to_half_fast(float f) {
-    int32 x = intbits(f);
-    unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
-    unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
-    unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
+    if (__have_native_half) {
+        return __float_to_half_varying(f);
+    }
+    else {
+        int32 x = intbits(f);
+        unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
+        unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
+        unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
 
-    unsigned int32 hs = (xs >> 16); // Sign bit
-    // Exponent unbias the single, then bias the halfp
-    int32 hes = ((int)(xe >> 23)) - 127 + 15;
-    unsigned int32 he = (hes << 10); // Exponent
-    int32 hm = (xm >> 13); // Mantissa
-    int32 ret = (hs | he | hm);
+        unsigned int32 hs = (xs >> 16); // Sign bit
+        // Exponent unbias the single, then bias the halfp
+        int32 hes = ((int)(xe >> 23)) - 127 + 15;
+        unsigned int32 he = (hes << 10); // Exponent
+        int32 hm = (xm >> 13); // Mantissa
+        int32 ret = (hs | he | hm);
 
-    if (xm & 0x00001000u) // Check for rounding
-        // Round, might overflow to inf, this is OK
-        ret += 1u;
+        if (xm & 0x00001000u) // Check for rounding
+            // Round, might overflow to inf, this is OK
+            ret += 1u;
 
-    return (int16)ret;
+        return (int16)ret;
+    }
 }
 
 ///////////////////////////////////////////////////////////////////////////

diff --git a/tests/half-3.ispc b/tests/half-3.ispc
new file mode 100644
index 00000000..47de0eee
--- /dev/null
+++ b/tests/half-3.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+export void f_v(uniform float RET[]) {
+    int errors = 0;
+
+    foreach (i = 0 ... 65535) {
+        unsigned int16 h = i;
+        float f = half_to_float(i);
+        h = float_to_half(f);
+
+        int mismatches = (f == f && i != h);
+        errors += reduce_add(mismatches);
+    }
+
+    RET[programIndex] = errors;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
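
Editor's note (illustrative, not part of the patch): a quick sketch of how
the new code paths are reached from user code. With --target=avx2, the
half_to_float()/float_to_half() stdlib calls below now lower to the F16C
vcvtph2ps/vcvtps2ph instructions through the __half_to_float_* /
__float_to_half_* built-ins declared above; on the other targets they fall
back to the bit-manipulation implementations kept in stdlib.ispc. The file,
function, and parameter names in this sketch are invented for illustration.

// halfbuf.ispc (hypothetical)

// Pack a float array into IEEE half-precision storage.
export void compress_to_half(uniform float fin[],
                             uniform unsigned int16 hout[],
                             uniform int count) {
    foreach (i = 0 ... count) {
        // On the native path the conversion compiles to vcvtps2ph,
        // with the immediate 0 selecting round-to-nearest-even.
        hout[i] = float_to_half(fin[i]);
    }
}

// Expand half-precision storage back to floats.
export void expand_from_half(uniform unsigned int16 hin[],
                             uniform float fout[],
                             uniform int count) {
    foreach (i = 0 ... count) {
        fout[i] = half_to_float(hin[i]);
    }
}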