[AVX512]: uniform float/double round/ceil/floor

This commit is contained in:
Anton Mitrokhin
2015-05-07 16:26:09 +03:00
parent 2110708c8e
commit ef9c98fba8

View File

@@ -107,17 +107,86 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats
declare void @__fastmath() nounwind
;; round/floor/ceil
declare float @__round_uniform_float(float) nounwind readnone
declare float @__floor_uniform_float(float) nounwind readnone
declare float @__ceil_uniform_float(float) nounwind readnone
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
declare double @__round_uniform_double(double) nounwind readnone
declare double @__floor_uniform_double(double) nounwind readnone
declare double @__ceil_uniform_double(double) nounwind readnone
define float @__round_uniform_float(float) nounwind readonly alwaysinline {
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
; the roundss intrinsic is a total mess--docs say:
;
; __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
;
; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
; on b0. The higher order 96 bits are copied directly from input parameter a. The
; return value is described by the following equations:
;
; r0 = RND(b0)
; r1 = a1
; r2 = a2
; r3 = a3
;
; It doesn't matter what we pass as a, since we only need the r0 value
; here. So we pass the same register for both. Further, only the 0th
; element of the b parameter matters
%xi = insertelement <4 x float> undef, float %0, i32 0
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0
; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
%rs = extractelement <4 x float> %xr, i32 0
ret float %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
define double @__round_uniform_double(double) nounwind readonly alwaysinline {
%xi = insertelement <2 x double> undef, double %0, i32 0
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;