From ef9c98fba8f251d262e4ad37e10c29b1a3e8232d Mon Sep 17 00:00:00 2001 From: Anton Mitrokhin Date: Thu, 7 May 2015 16:26:09 +0300 Subject: [PATCH] [AVX512]: uniform float/double round/ceil/floor --- builtins/target-avx512-common.ll | 81 +++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 6 deletions(-) diff --git a/builtins/target-avx512-common.ll b/builtins/target-avx512-common.ll index 34c94dba..877827a2 100644 --- a/builtins/target-avx512-common.ll +++ b/builtins/target-avx512-common.ll @@ -107,17 +107,86 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; math +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + declare void @__fastmath() nounwind ;; round/floor/ceil -declare float @__round_uniform_float(float) nounwind readnone -declare float @__floor_uniform_float(float) nounwind readnone -declare float @__ceil_uniform_float(float) nounwind readnone +declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone -declare double @__round_uniform_double(double) nounwind readnone -declare double @__floor_uniform_double(double) nounwind readnone -declare double @__ceil_uniform_double(double) nounwind readnone +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + ; the roundss intrinsic is a total mess--docs say: + ; + ; __m128 _mm_round_ss (__m128 a, __m128 b, const int c) + ; + ; b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function + ; on b0. The higher order 96 bits are copied directly from input parameter a. The + ; return value is described by the following equations: + ; + ; r0 = RND(b0) + ; r1 = a1 + ; r2 = a2 + ; r3 = a3 + ; + ; It doesn't matter what we pass as a, since we only need the r0 value + ; here. So we pass the same register for both. Further, only the 0th + ; element of the b parameter matters + %xi = insertelement <4 x float> undef, float %0, i32 0 + %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8) + %rs = extractelement <4 x float> %xr, i32 0 + ret float %rs +} + +define float @__floor_uniform_float(float) nounwind readonly alwaysinline { + ; see above for round_ss instrinsic discussion... + %xi = insertelement <4 x float> undef, float %0, i32 0 + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9) + %rs = extractelement <4 x float> %xr, i32 0 + ret float %rs +} + +define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { + ; see above for round_ss instrinsic discussion... + %xi = insertelement <4 x float> undef, float %0, i32 0 + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10) + %rs = extractelement <4 x float> %xr, i32 0 + ret float %rs +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone + +define double @__round_uniform_double(double) nounwind readonly alwaysinline { + %xi = insertelement <2 x double> undef, double %0, i32 0 + %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8) + %rs = extractelement <2 x double> %xr, i32 0 + ret double %rs +} + +define double @__floor_uniform_double(double) nounwind readonly alwaysinline { + ; see above for round_ss instrinsic discussion... + %xi = insertelement <2 x double> undef, double %0, i32 0 + ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9) + %rs = extractelement <2 x double> %xr, i32 0 + ret double %rs +} + +define double @__ceil_uniform_double(double) nounwind readonly alwaysinline { + ; see above for round_ss instrinsic discussion... + %xi = insertelement <2 x double> undef, double %0, i32 0 + ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10) + %rs = extractelement <2 x double> %xr, i32 0 + ret double %rs +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;