Use native float/half conversion instructions with the AVX2 target.

This commit is contained in:
Matt Pharr
2012-01-24 15:33:38 -08:00
parent a5b7fca7e0
commit 1867b5b317
12 changed files with 453 additions and 256 deletions

View File

@@ -822,6 +822,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module, lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
symbolTable); symbolTable);
lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
module, symbolTable);
if (includeStdlibISPC) { if (includeStdlibISPC) {
// If the user wants the standard library to be included, parse the // If the user wants the standard library to be included, parse the
// serialized version of the stdlib.ispc file to get its // serialized version of the stdlib.ispc file to get its

View File

@@ -58,6 +58,14 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
ret <16 x i32> %ret ret <16 x i32> %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather ;; gather

View File

@@ -58,6 +58,14 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
ret <8 x i32> %ret ret <8 x i32> %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather ;; gather
@@ -65,6 +73,3 @@ gen_gather(8, i8)
gen_gather(8, i16) gen_gather(8, i16)
gen_gather(8, i32) gen_gather(8, i32)
gen_gather(8, i64) gen_gather(8, i64)

View File

@@ -63,6 +63,61 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
ret <16 x i32> %m ret <16 x i32> %m
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone {
  ; The conversion intrinsic works on eight lanes per call, so the 16-wide
  ; input is split into a low and a high group of eight, each group is
  ; converted separately, and the two results are joined back together in
  ; the original lane order.
  %half_lo = shufflevector <16 x i16> %v, <16 x i16> undef,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %half_hi = shufflevector <16 x i16> %v, <16 x i16> undef,
      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  %float_lo = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %half_lo)
  %float_hi = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %half_hi)

  ; Lanes 0-7 come from the low result, lanes 8-15 from the high result.
  %joined = shufflevector <8 x float> %float_lo, <8 x float> %float_hi,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %joined
}
define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone {
  ; Same split, convert, rejoin scheme as the half to float direction: the
  ; intrinsic takes eight floats per call, so the 16-wide input is handled
  ; as two groups of eight lanes.
  %float_lo = shufflevector <16 x float> %v, <16 x float> undef,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %float_hi = shufflevector <16 x float> %v, <16 x float> undef,
      <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>

  ; The immediate operand 0 selects round to nearest even.
  %half_lo = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %float_lo, i32 0)
  %half_hi = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %float_hi, i32 0)

  ; Lanes 0-7 come from the low result, lanes 8-15 from the high result.
  %joined = shufflevector <8 x i16> %half_lo, <8 x i16> %half_hi,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %joined
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  ; Reinterpret the scalar as a one-element vector so that it can be used
  ; as a shuffle operand.
  %scalar_vec = bitcast i16 %v to <1 x i16>
  ; Widen to the eight lanes the intrinsic expects: lane 0 carries the
  ; input value and lanes 1-7 are left undef.
  %widened = shufflevector <1 x i16> %scalar_vec, <1 x i16> undef,
      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  %converted = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %widened)
  ; Only lane 0 holds a meaningful result.
  %result = extractelement <8 x float> %converted, i32 0
  ret float %result
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  ; Reinterpret the scalar as a one-element vector so that it can be used
  ; as a shuffle operand.
  %scalar_vec = bitcast float %v to <1 x float>
  ; Widen to the eight lanes the intrinsic expects: lane 0 carries the
  ; input value and lanes 1-7 are left undef.
  %widened = shufflevector <1 x float> %scalar_vec, <1 x float> undef,
      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  ; The immediate operand 0 selects round to nearest even.
  %converted = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %widened, i32 0)
  ; Only lane 0 holds a meaningful result.
  %result = extractelement <8 x i16> %converted, i32 0
  ret i16 %result
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather ;; gather

View File

@@ -63,6 +63,44 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
ret <8 x i32> %m ret <8 x i32> %m
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone {
  ; The vector width matches the intrinsic exactly, so a single call
  ; converts every lane.
  %as_float = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %v)
  ret <8 x float> %as_float
}
define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone {
  ; The vector width matches the intrinsic exactly, so a single call
  ; converts every lane. The immediate operand 0 selects round to nearest
  ; even.
  %as_half = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)
  ret <8 x i16> %as_half
}
define float @__half_to_float_uniform(i16 %v) nounwind readnone {
  ; Reinterpret the scalar as a one-element vector so that it can be used
  ; as a shuffle operand.
  %scalar_vec = bitcast i16 %v to <1 x i16>
  ; Widen to the eight lanes the intrinsic expects: lane 0 carries the
  ; input value and lanes 1-7 are left undef.
  %widened = shufflevector <1 x i16> %scalar_vec, <1 x i16> undef,
      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  %converted = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %widened)
  ; Only lane 0 holds a meaningful result.
  %result = extractelement <8 x float> %converted, i32 0
  ret float %result
}
define i16 @__float_to_half_uniform(float %v) nounwind readnone {
  ; Reinterpret the scalar as a one-element vector so that it can be used
  ; as a shuffle operand.
  %scalar_vec = bitcast float %v to <1 x float>
  ; Widen to the eight lanes the intrinsic expects: lane 0 carries the
  ; input value and lanes 1-7 are left undef.
  %widened = shufflevector <1 x float> %scalar_vec, <1 x float> undef,
      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  ; The immediate operand 0 selects round to nearest even.
  %converted = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %widened, i32 0)
  ; Only lane 0 holds a meaningful result.
  %result = extractelement <8 x i16> %converted, i32 0
  ret i16 %result
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather ;; gather
@@ -70,6 +108,3 @@ gen_gather(8, i8)
gen_gather(8, i16) gen_gather(8, i16)
gen_gather(8, i32) gen_gather(8, i32)
gen_gather(8, i64) gen_gather(8, i64)

View File

@@ -47,6 +47,14 @@ int64minmax()
include(`target-sse2-common.ll') include(`target-sse2-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp ;; rcp

View File

@@ -44,6 +44,14 @@ int64minmax()
include(`target-sse2-common.ll') include(`target-sse2-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding ;; rounding
;; ;;

View File

@@ -47,6 +47,14 @@ int64minmax()
include(`target-sse4-common.ll') include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp ;; rcp

View File

@@ -44,6 +44,14 @@ int64minmax()
include(`target-sse4-common.ll') include(`target-sse4-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines
declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp ;; rcp

View File

@@ -210,7 +210,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->isa = Target::AVX2; t->isa = Target::AVX2;
t->nativeVectorWidth = 8; t->nativeVectorWidth = 8;
t->vectorWidth = 8; t->vectorWidth = 8;
t->attributes = "+avx2,+popcnt,+cmov"; t->attributes = "+avx2,+popcnt,+cmov,+f16c";
t->maskingIsFree = false; t->maskingIsFree = false;
t->allOffMaskIsSafe = false; t->allOffMaskIsSafe = false;
t->maskBitCount = 32; t->maskBitCount = 32;
@@ -219,7 +219,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->isa = Target::AVX2; t->isa = Target::AVX2;
t->nativeVectorWidth = 16; t->nativeVectorWidth = 16;
t->vectorWidth = 16; t->vectorWidth = 16;
t->attributes = "+avx2,+popcnt,+cmov"; t->attributes = "+avx2,+popcnt,+cmov,+f16c";
t->maskingIsFree = false; t->maskingIsFree = false;
t->allOffMaskIsSafe = false; t->allOffMaskIsSafe = false;
t->maskBitCount = 32; t->maskBitCount = 32;

View File

@@ -2824,6 +2824,10 @@ static inline uniform double pow(uniform double a, uniform double b) {
// half-precision floats // half-precision floats
static inline uniform float half_to_float(uniform unsigned int16 h) { static inline uniform float half_to_float(uniform unsigned int16 h) {
if (__have_native_half) {
return __half_to_float_uniform(h);
}
else {
if ((h & 0x7FFFu) == 0) if ((h & 0x7FFFu) == 0)
// Signed zero // Signed zero
return floatbits(((unsigned int32) h) << 16); return floatbits(((unsigned int32) h) << 16);
@@ -2879,8 +2883,13 @@ static inline uniform float half_to_float(uniform unsigned int16 h) {
} }
} }
} }
}
static inline float half_to_float(unsigned int16 h) { static inline float half_to_float(unsigned int16 h) {
if (__have_native_half) {
return __half_to_float_varying(h);
}
else {
if ((h & 0x7FFFu) == 0) if ((h & 0x7FFFu) == 0)
// Signed zero // Signed zero
return floatbits(((unsigned int32) h) << 16); return floatbits(((unsigned int32) h) << 16);
@@ -2936,9 +2945,14 @@ static inline float half_to_float(unsigned int16 h) {
} }
} }
} }
}
static inline uniform int16 float_to_half(uniform float f) { static inline uniform int16 float_to_half(uniform float f) {
if (__have_native_half) {
return __float_to_half_uniform(f);
}
else {
uniform int32 x = intbits(f); uniform int32 x = intbits(f);
// Store the return value in an int32 until the very end; this ends up // Store the return value in an int32 until the very end; this ends up
// generating better code... // generating better code...
@@ -3002,9 +3016,14 @@ static inline uniform int16 float_to_half(uniform float f) {
} }
return (int16)ret; return (int16)ret;
} }
}
static inline int16 float_to_half(float f) { static inline int16 float_to_half(float f) {
if (__have_native_half) {
return __float_to_half_varying(f);
}
else {
int32 x = intbits(f); int32 x = intbits(f);
// Store the return value in an int32 until the very end; this ends up // Store the return value in an int32 until the very end; this ends up
// generating better code... // generating better code...
@@ -3068,9 +3087,14 @@ static inline int16 float_to_half(float f) {
} }
return (int16)ret; return (int16)ret;
} }
}
static inline uniform float half_to_float_fast(uniform unsigned int16 h) { static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
if (__have_native_half) {
return __half_to_float_uniform(h);
}
else {
uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit uniform unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
uniform unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits uniform unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
uniform unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits uniform unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
@@ -3084,10 +3108,14 @@ static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
// Mantissa // Mantissa
uniform unsigned int32 xm = ((unsigned int32) hm) << 13; uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
return floatbits(xs | xe | xm); return floatbits(xs | xe | xm);
}
} }
static inline float half_to_float_fast(unsigned int16 h) { static inline float half_to_float_fast(unsigned int16 h) {
if (__have_native_half) {
return __half_to_float_varying(h);
}
else {
unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit unsigned int32 hs = h & (int32)0x8000u; // Pick off sign bit
unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits unsigned int32 he = h & (int32)0x7C00u; // Pick off exponent bits
unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits unsigned int32 hm = h & (int32)0x03FFu; // Pick off mantissa bits
@@ -3101,10 +3129,14 @@ static inline float half_to_float_fast(unsigned int16 h) {
// Mantissa // Mantissa
unsigned int32 xm = ((unsigned int32) hm) << 13; unsigned int32 xm = ((unsigned int32) hm) << 13;
return floatbits(xs | xe | xm); return floatbits(xs | xe | xm);
}
} }
static inline uniform int16 float_to_half_fast(uniform float f) { static inline uniform int16 float_to_half_fast(uniform float f) {
if (__have_native_half) {
return __float_to_half_uniform(f);
}
else {
uniform int32 x = intbits(f); uniform int32 x = intbits(f);
uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit uniform unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits uniform unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
@@ -3123,8 +3155,13 @@ static inline uniform int16 float_to_half_fast(uniform float f) {
return (int16)ret; return (int16)ret;
} }
}
static inline int16 float_to_half_fast(float f) { static inline int16 float_to_half_fast(float f) {
if (__have_native_half) {
return __float_to_half_varying(f);
}
else {
int32 x = intbits(f); int32 x = intbits(f);
unsigned int32 xs = x & 0x80000000u; // Pick off sign bit unsigned int32 xs = x & 0x80000000u; // Pick off sign bit
unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits unsigned int32 xe = x & 0x7F800000u; // Pick off exponent bits
@@ -3143,6 +3180,7 @@ static inline int16 float_to_half_fast(float f) {
return (int16)ret; return (int16)ret;
} }
}
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// RNG stuff // RNG stuff

21
tests/half-3.ispc Normal file
View File

@@ -0,0 +1,21 @@
// Reports the number of program instances this test was compiled for.
export uniform int width() {
    return programCount;
}
// Round-trips 16-bit patterns through half_to_float and float_to_half and
// writes the number of patterns that did not come back unchanged into RET.
export void f_v(uniform float RET[]) {
    int failures = 0;
    // NOTE(review): per the ISPC docs the foreach upper bound looks to be
    // exclusive, so pattern 0xFFFF would not be visited -- confirm.
    foreach (bits = 0 ... 65535) {
        float asFloat = half_to_float(bits);
        unsigned int16 roundTrip = float_to_half(asFloat);
        // asFloat == asFloat is false for NaN values, so those lanes are
        // never counted as mismatches.
        int bad = (asFloat == asFloat && bits != roundTrip);
        // reduce_add folds the per-lane flags into one total, which is then
        // accumulated into the running count.
        failures += reduce_add(bad);
    }
    RET[programIndex] = failures;
}
// Writes zero to this program instance's slot of RET.
// Presumably the expected output for f_v, i.e. no mismatches -- confirm
// against the test harness.
export void result(uniform float RET[]) {
    RET[programIndex] = 0;
}