From 28b49837fc5604951b03bf1134601e76893373c5 Mon Sep 17 00:00:00 2001
From: Vsevolod Livinskiy <vsevolod.livinskij@frtk.ru>
Date: Thu, 21 May 2015 16:53:18 +0300
Subject: [PATCH] Add round2to16double macro and use it for sse4-8 double rounding

---
 builtins/target-sse4-8.ll | 10 +++-------
 builtins/util.m4          | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 7 deletions(-)
diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll
index e7fdb8f2..0688ad33 100644
--- a/builtins/target-sse4-8.ll
+++ b/builtins/target-sse4-8.ll
@@ -143,21 +143,17 @@ define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly always
 declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
 
 define <16 x double> @__round_varying_double(<16 x double>) nounwind readonly alwaysinline {
-;  XXXround2to4double(%0, 8)
-  ; FIXME: need round2to16double in util.m4...
-  ret <16 x double> undef  
+    round2to16double(%0, 8)  ; roundpd, round to nearest 0b00 | no precision exceptions 0b1000 = 8; the macro expansion ends with the ret
 }
 
 define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline {
   ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
-;  XXXround2to4double(%0, 9)
-  ret <16 x double> undef  
+    round2to16double(%0, 9)  ; the macro expansion ends with the ret for this function
 }
 
 define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline {
   ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
-;  XXXround2to4double(%0, 10)
-  ret <16 x double> undef  
+    round2to16double(%0, 10)  ; the macro expansion ends with the ret for this function
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/builtins/util.m4 b/builtins/util.m4
index 8951605b..f22705d0 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1130,6 +1130,42 @@ ret <8 x double> %ret
 '
 )
 
+define(`round2to16double', `
+%v0 = shufflevector <16 x double> $1, <16 x double> undef, <2 x i32> <i32 0,  i32 1>  ; first argument is the 16-wide double vector; split it into eight 2-wide pairs
+%v1 = shufflevector <16 x double> $1, <16 x double> undef, <2 x i32> <i32 2,  i32 3>
+%v2 = shufflevector <16 x double> $1, <16 x double> undef, <2 x i32> <i32 4,  i32 5>
+%v3 = shufflevector <16 x double> $1, <16 x double> undef, <2 x i32> <i32 6,  i32 7>
+%v4 = shufflevector <16 x double> $1, <16 x double> undef, <2 x i32> <i32 8,  i32 9>
+%v5 = shufflevector <16 x double> $1, <16 x double> undef, <2 x i32> <i32 10, i32 11>
+%v6 = shufflevector <16 x double> $1, <16 x double> undef, <2 x i32> <i32 12, i32 13>
+%v7 = shufflevector <16 x double> $1, <16 x double> undef, <2 x i32> <i32 14, i32 15>
+%r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2)  ; second argument is the i32 immediate handed to the SSE4.1 round.pd intrinsic for every pair
+%r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2)
+%r2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v2, i32 $2)
+%r3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v3, i32 $2)
+%r4 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v4, i32 $2)
+%r5 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v5, i32 $2)
+%r6 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v6, i32 $2)
+%r7 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v7, i32 $2)
+%ret0 = shufflevector <2 x double> %r0, <2 x double> %r1,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; rounded lanes 0-3
+%ret1 = shufflevector <2 x double> %r2, <2 x double> %r3,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; rounded lanes 4-7
+%ret01 = shufflevector <4 x double> %ret0, <4 x double> %ret1,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; low half, lanes 0-7
+%ret2 = shufflevector <2 x double> %r4, <2 x double> %r5,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; rounded lanes 8-11
+%ret3 = shufflevector <2 x double> %r6, <2 x double> %r7,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; rounded lanes 12-15
+%ret23 = shufflevector <4 x double> %ret2, <4 x double> %ret3,
+          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>  ; high half, lanes 8-15
+%ret = shufflevector <8 x double> %ret01, <8 x double> %ret23,
+          <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                      i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>  ; both halves joined in the original lane order
+ret <16 x double> %ret  ; terminates the function the macro is expanded in, so the macro has to come last in that body
+'
+)
+
 define(`round4to16double', `
 %v0 = shufflevector <16 x double> $1, <16 x double> undef,
          <4 x i32> <i32 0, i32 1, i32 2, i32 3>