Use native float/half conversion instructions with the AVX2 target.

2012-01-24 15:33:38 -08:00
parent a5b7fca7e0
commit 1867b5b317
12 changed files with 453 additions and 256 deletions
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -822,6 +822,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
                           symbolTable);
    lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
                       module, symbolTable);
    if (includeStdlibISPC) {
        // If the user wants the standard library to be included, parse the
        // serialized version of the stdlib.ispc file to get its
--- a/builtins/target-avx1-x2.ll
+++ b/builtins/target-avx1-x2.ll
@@ -58,6 +58,14 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
  ret <16 x i32> %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines
 ;; These four routines are only declared here (no body is given in this
 ;; section).
 ;; NOTE(review): presumably they are never actually called on this target,
 ;; since __have_native_half is only set to true for the AVX2 ISA and the
 ;; stdlib guards each call with it -- confirm the calls are eliminated.
 declare float @__half_to_float_uniform(i16 %v) nounwind readnone
 declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
 declare i16 @__float_to_half_uniform(float %v) nounwind readnone
 declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
--- a/builtins/target-avx1.ll
+++ b/builtins/target-avx1.ll
@@ -58,6 +58,14 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
  ret <8 x i32> %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines
 ;; These four routines are only declared here (no body is given in this
 ;; section).
 ;; NOTE(review): presumably they are never actually called on this target,
 ;; since __have_native_half is only set to true for the AVX2 ISA and the
 ;; stdlib guards each call with it -- confirm the calls are eliminated.
 declare float @__half_to_float_uniform(i16 %v) nounwind readnone
 declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
 declare i16 @__float_to_half_uniform(float %v) nounwind readnone
 declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
@@ -65,6 +73,3 @@ gen_gather(8, i8)
 gen_gather(8, i16)
 gen_gather(8, i32)
 gen_gather(8, i64)
--- a/builtins/target-avx2-x2.ll
+++ b/builtins/target-avx2-x2.ll
@@ -63,6 +63,61 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
  ret <16 x i32> %m
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float/half conversions
 ; 8-lane x86 intrinsics used by the conversion routines in this section:
 ; vcvtph2ps maps <8 x i16> half bit patterns to <8 x float>, and vcvtps2ph
 ; maps <8 x float> back to <8 x i16>.
 declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
 ; 0 is round nearest even
 ; (value passed as the i32 operand of vcvtps2ph)
 declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
 ;; Convert 16 half-precision bit patterns to 16 floats.  The vcvtph2ps
 ;; intrinsic works on 8 lanes per call, so the input is split into its low
 ;; and high 8-lane halves, each half is converted, and the two results are
 ;; concatenated back into a 16-wide vector.
 define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
   ; lanes 0-7 and lanes 8-15 of the input
   %half_lo = shufflevector <16 x i16> %v, <16 x i16> undef,
       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %half_hi = shufflevector <16 x i16> %v, <16 x i16> undef,
       <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

   ; one 8-wide conversion per half
   %float_lo = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %half_lo)
   %float_hi = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %half_hi)

   ; concatenate: indices 0-7 select from %float_lo, 8-15 from %float_hi
   %joined = shufflevector <8 x float> %float_lo, <8 x float> %float_hi,
       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x float> %joined
 }
 ;; Convert 16 floats to 16 half-precision bit patterns.  The vcvtps2ph
 ;; intrinsic works on 8 lanes per call, so the input is split into its low
 ;; and high 8-lane halves, each half is converted with rounding operand 0
 ;; (round to nearest even), and the results are concatenated.
 define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
   ; lanes 0-7 and lanes 8-15 of the input
   %float_lo = shufflevector <16 x float> %v, <16 x float> undef,
       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %float_hi = shufflevector <16 x float> %v, <16 x float> undef,
       <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

   ; one 8-wide conversion per half
   %half_lo = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %float_lo, i32 0)
   %half_hi = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %float_hi, i32 0)

   ; concatenate: indices 0-7 select from %half_lo, 8-15 from %half_hi
   %joined = shufflevector <8 x i16> %half_lo, <8 x i16> %half_hi,
       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                   i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i16> %joined
 }
 ;; Convert one half-precision bit pattern to a float.  Only an 8-wide
 ;; conversion intrinsic is used, so the scalar is placed in lane 0 of an
 ;; 8-wide vector (the other lanes are left undefined), the vector is
 ;; converted, and lane 0 of the result is returned.
 define float @__half_to_float_uniform(i16 %v) nounwind readnone {
   %half_vec = insertelement <8 x i16> undef, i16 %v, i32 0
   %float_vec = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %half_vec)
   %lane0 = extractelement <8 x float> %float_vec, i32 0
   ret float %lane0
 }
 ;; Convert one float to a half-precision bit pattern.  Only an 8-wide
 ;; conversion intrinsic is used, so the scalar is placed in lane 0 of an
 ;; 8-wide vector (the other lanes are left undefined), the vector is
 ;; converted, and lane 0 of the result is returned.
 define i16 @__float_to_half_uniform(float %v) nounwind readnone {
   %float_vec = insertelement <8 x float> undef, float %v, i32 0
   ; rounding operand 0: round to nearest even
   %half_vec = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %float_vec, i32 0)
   %lane0 = extractelement <8 x i16> %half_vec, i32 0
   ret i16 %lane0
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
--- a/builtins/target-avx2.ll
+++ b/builtins/target-avx2.ll
@@ -63,6 +63,44 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
  ret <8 x i32> %m
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float/half conversions
 ; 8-lane x86 intrinsics used by the conversion routines in this section:
 ; vcvtph2ps maps <8 x i16> half bit patterns to <8 x float>, and vcvtps2ph
 ; maps <8 x float> back to <8 x i16>.
 declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
 ; 0 is round nearest even
 ; (value passed as the i32 operand of vcvtps2ph)
 declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
 ;; Convert 8 half-precision bit patterns to 8 floats.  The vector width
 ;; matches the 8-lane vcvtph2ps intrinsic, so this is a single call.
 define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
   %as_float = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
   ret <8 x float> %as_float
 }
 ;; Convert 8 floats to 8 half-precision bit patterns.  The vector width
 ;; matches the 8-lane vcvtps2ph intrinsic, so this is a single call.
 define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
   ; rounding operand 0: round to nearest even
   %as_half = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
   ret <8 x i16> %as_half
 }
 ;; Convert one half-precision bit pattern to a float.  Only an 8-wide
 ;; conversion intrinsic is used, so the scalar is placed in lane 0 of an
 ;; 8-wide vector (the other lanes are left undefined), the vector is
 ;; converted, and lane 0 of the result is returned.
 define float @__half_to_float_uniform(i16 %v) nounwind readnone {
   %half_vec = insertelement <8 x i16> undef, i16 %v, i32 0
   %float_vec = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %half_vec)
   %lane0 = extractelement <8 x float> %float_vec, i32 0
   ret float %lane0
 }
 ;; Convert one float to a half-precision bit pattern.  Only an 8-wide
 ;; conversion intrinsic is used, so the scalar is placed in lane 0 of an
 ;; 8-wide vector (the other lanes are left undefined), the vector is
 ;; converted, and lane 0 of the result is returned.
 define i16 @__float_to_half_uniform(float %v) nounwind readnone {
   %float_vec = insertelement <8 x float> undef, float %v, i32 0
   ; rounding operand 0: round to nearest even
   %half_vec = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %float_vec, i32 0)
   %lane0 = extractelement <8 x i16> %half_vec, i32 0
   ret i16 %lane0
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
@@ -70,6 +108,3 @@ gen_gather(8, i8)
 gen_gather(8, i16)
 gen_gather(8, i32)
 gen_gather(8, i64)
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -47,6 +47,14 @@ int64minmax()
 include(`target-sse2-common.ll')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines
 ;; These four routines are only declared here (no body is given in this
 ;; section).
 ;; NOTE(review): presumably they are never actually called on this target,
 ;; since __have_native_half is only set to true for the AVX2 ISA and the
 ;; stdlib guards each call with it -- confirm the calls are eliminated.
 declare float @__half_to_float_uniform(i16 %v) nounwind readnone
 declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
 declare i16 @__float_to_half_uniform(float %v) nounwind readnone
 declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -44,6 +44,14 @@ int64minmax()
 include(`target-sse2-common.ll')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines
 ;; These four routines are only declared here (no body is given in this
 ;; section).
 ;; NOTE(review): presumably they are never actually called on this target,
 ;; since __have_native_half is only set to true for the AVX2 ISA and the
 ;; stdlib guards each call with it -- confirm the calls are eliminated.
 declare float @__half_to_float_uniform(i16 %v) nounwind readnone
 declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
 declare i16 @__float_to_half_uniform(float %v) nounwind readnone
 declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding
 ;;
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -47,6 +47,14 @@ int64minmax()
 include(`target-sse4-common.ll')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines
 ;; These four routines are only declared here (no body is given in this
 ;; section).
 ;; NOTE(review): presumably they are never actually called on this target,
 ;; since __have_native_half is only set to true for the AVX2 ISA and the
 ;; stdlib guards each call with it -- confirm the calls are eliminated.
 declare float @__half_to_float_uniform(i16 %v) nounwind readnone
 declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
 declare i16 @__float_to_half_uniform(float %v) nounwind readnone
 declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -44,6 +44,14 @@ int64minmax()
 include(`target-sse4-common.ll')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines
 ;; These four routines are only declared here (no body is given in this
 ;; section).
 ;; NOTE(review): presumably they are never actually called on this target,
 ;; since __have_native_half is only set to true for the AVX2 ISA and the
 ;; stdlib guards each call with it -- confirm the calls are eliminated.
 declare float @__half_to_float_uniform(i16 %v) nounwind readnone
 declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
 declare i16 @__float_to_half_uniform(float %v) nounwind readnone
 declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -210,7 +210,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->isa = Target::AVX2;
        t->nativeVectorWidth = 8;
        t->vectorWidth = 8;
-        t->attributes = "+avx2,+popcnt,+cmov";
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
        t->maskingIsFree = false;
        t->allOffMaskIsSafe = false;
        t->maskBitCount = 32;
@@ -219,7 +219,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->isa = Target::AVX2;
        t->nativeVectorWidth = 16;
        t->vectorWidth = 16;
-        t->attributes = "+avx2,+popcnt,+cmov";
+        t->attributes = "+avx2,+popcnt,+cmov,+f16c";
        t->maskingIsFree = false;
        t->allOffMaskIsSafe = false;
        t->maskBitCount = 32;
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -2824,6 +2824,10 @@ static inline uniform double pow(uniform double a, uniform double b) {
 // half-precision floats
 static inline uniform float half_to_float(uniform unsigned int16 h) {
    if (__have_native_half) {
        return __half_to_float_uniform(h);
    }
    else {
        if ((h & 0x7FFFu) == 0) 
            // Signed zero
            return floatbits(((unsigned int32) h) << 16);
@@ -2879,8 +2883,13 @@ static inline uniform float half_to_float(uniform unsigned int16 h) {
            }
        }
    }
 }
 static inline float half_to_float(unsigned int16 h) {
    if (__have_native_half) {
        return __half_to_float_varying(h);
    }
    else {
        if ((h & 0x7FFFu) == 0) 
            // Signed zero
            return floatbits(((unsigned int32) h) << 16);
@@ -2936,9 +2945,14 @@ static inline float half_to_float(unsigned int16 h) {
            }
        }
    }
 }
 static inline uniform int16 float_to_half(uniform float f) {
    if (__have_native_half) {
        return __float_to_half_uniform(f);
    }
    else {
        uniform int32 x = intbits(f);
        // Store the return value in an int32 until the very end; this ends up
        // generating better code...
@@ -3002,9 +3016,14 @@ static inline uniform int16 float_to_half(uniform float f) {
        }
        return (int16)ret;
    }
 }
 static inline int16 float_to_half(float f) {
    if (__have_native_half) {
        return __float_to_half_varying(f);
    }
    else {
        int32 x = intbits(f);
        // Store the return value in an int32 until the very end; this ends up
        // generating better code...
@@ -3068,9 +3087,14 @@ static inline int16 float_to_half(float f) {
        }
        return (int16)ret;
    }
 }
 static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
    if (__have_native_half) {
        return __half_to_float_uniform(h);
    }
    else {
        uniform unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
        uniform unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
        uniform unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
@@ -3084,10 +3108,14 @@ static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
        // Mantissa
        uniform unsigned int32 xm = ((unsigned int32) hm) << 13; 
        return floatbits(xs | xe | xm);
-
+    }
 }
 static inline float half_to_float_fast(unsigned int16 h) {
    if (__have_native_half) {
        return __half_to_float_varying(h);
    }
    else {
        unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
        unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
        unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
@@ -3101,10 +3129,14 @@ static inline float half_to_float_fast(unsigned int16 h) {
        // Mantissa
        unsigned int32 xm = ((unsigned int32) hm) << 13; 
        return floatbits(xs | xe | xm);
-
+    }
 }
 static inline uniform int16 float_to_half_fast(uniform float f) {
    if (__have_native_half) {
        return __float_to_half_uniform(f);
    }
    else {
        uniform int32 x = intbits(f);
        uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
        uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
@@ -3123,8 +3155,13 @@ static inline uniform int16 float_to_half_fast(uniform float f) {
        return (int16)ret;
    }
 }
 static inline int16 float_to_half_fast(float f) {
    if (__have_native_half) {
        return __float_to_half_varying(f);
    }
    else {
        int32 x = intbits(f);
        unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
        unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
@@ -3143,6 +3180,7 @@ static inline int16 float_to_half_fast(float f) {
        return (int16)ret;
    }
 }
 ///////////////////////////////////////////////////////////////////////////
 // RNG stuff
--- a/tests/half-3.ispc
+++ b/tests/half-3.ispc
@@ -0,0 +1,21 @@
 // Returns programCount.
 // NOTE(review): presumably queried by the test harness to size the RET
 // arrays -- confirm against the harness.
 export uniform int width() { return programCount; }
 // Round-trip test for the half conversions: every value i in the foreach
 // range is converted to float with half_to_float() and back with
 // float_to_half(), and lanes whose round-tripped value differs from i are
 // counted.  The accumulated count is written to RET[programIndex].
 export void f_v(uniform float RET[]) {
    int errors = 0;
    // NOTE(review): if the foreach end bound is exclusive, 65535 (0xFFFF)
    // itself is not visited -- confirm whether that is intended.
    foreach (i = 0 ... 65535) {
        unsigned int16 h = i;
        // Note that i, not h, is passed here; h is overwritten on the next
        // line with the round-tripped value.
        float f = half_to_float(i);
        h = float_to_half(f);
        // f == f is false only when f is NaN, so NaN results are excluded
        // from the comparison.
        int mismatches = (f == f && i != h);
        errors += reduce_add(mismatches);
    }
    RET[programIndex] = errors;
 }
 // Writes 0 to RET[programIndex].
 // NOTE(review): presumably the expected output that the harness compares
 // against f_v's RET, i.e. zero mismatches -- confirm against the harness.
 export void result(uniform float RET[]) {
    RET[programIndex] = 0;
 }