From 42c148bf75bf8efc498370c51a90e71010bdb725 Mon Sep 17 00:00:00 2001
From: Vsevolod Livinskij <vsevolod.livinskij@frtk.ru>
Date: Fri, 29 Nov 2013 03:33:40 +0400
Subject: [PATCH] Changes for sse2 and sse4 in saturation

---
 builtins/target-avx-common.ll  |  97 --------------------------
 builtins/target-sse2-common.ll |  48 +------------
 builtins/target-sse2-x2.ll     |  82 ++++++++++++++++++++++
 builtins/target-sse2.ll        | 122 +++++++++++++++++++++++++++++++++
 builtins/target-sse4-16.ll     |  82 ++++++++++++++++++++++
 builtins/target-sse4-8.ll      |  43 ++++++++++++
 builtins/target-sse4-common.ll |  49 +------------
 builtins/target-sse4-x2.ll     |  82 ++++++++++++++++++++++
 builtins/target-sse4.ll        | 122 +++++++++++++++++++++++++++++++++
 9 files changed, 535 insertions(+), 192 deletions(-)

diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll
index d47145f2..dcca74f0 100644
--- a/builtins/target-avx-common.ll
+++ b/builtins/target-avx-common.ll
@@ -41,103 +41,6 @@ define_prefetches()
 define_shuffles()
 aossoa()
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; saturation arithmetic
-
-declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
-define i8 @__padds_i8(i8 %a0, i8 %a1) {
-  sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.padds.b, %a0, %a1)
-  ret i8 %ret
-}
-
-define <32 x i8> @__padds_vi8(<32 x i8> %a0, <32 x i8> %a1) {
-  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-
-
-declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
-define i16 @__padds_i16(i16 %a0, i16 %a1) {
-  sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.padds.w, %a0, %a1)
-  ret i16 %ret
-}
-
-define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) {
-  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-
-
-declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
-define i8 @__paddus_i8(i8 %a0, i8 %a1) {
-  sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.paddus.b, %a0, %a1)
-  ret i8 %ret
-}
-
-define <32 x i8> @__paddus_vi8(<32 x i8> %a0, <32 x i8> %a1) {
-  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-
-
-declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
-define i16 @__paddus_i16(i16 %a0, i16 %a1) {
-  sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.paddus.w, %a0, %a1)
-  ret i16 %ret
-}
-
-define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) {
-  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-
-
-declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
-define i8 @__psubs_i8(i8 %a0, i8 %a1) {
-  sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubs.b, %a0, %a1)
-  ret i8 %ret
-}
-
-define <32 x i8> @__psubs_vi8(<32 x i8> %a0, <32 x i8> %a1) {
-  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-
-
-declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
-define i16 @__psubs_i16(i16 %a0, i16 %a1) {
-  sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubs.w, %a0, %a1)
-  ret i16 %ret
-}
-
-define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) {
-  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-
-
-declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
-define i8 @__psubus_i8(i8 %a0, i8 %a1) {
-  sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubus.b, %a0, %a1)
-  ret i8 %ret
-}
-
-define <32 x i8> @__psubus_vi8(<32 x i8> %a0, <32 x i8> %a1) {
-  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-
-
-declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
-define i16 @__psubus_i16(i16 %a0, i16 %a1) {
-  sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubus.w, %a0, %a1)
-  ret i16 %ret
-}
-
-define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) {
-  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats
 
diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll
index a1e6f915..a1fec300 100644
--- a/builtins/target-sse2-common.ll
+++ b/builtins/target-sse2-common.ll
@@ -36,7 +36,7 @@ aossoa()
 rdrand_decls()
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; saturation arithmetic
+;;scalar saturation arithmetic
 
 declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
 define i8 @__padds_i8(i8 %a0, i8 %a1) {
@@ -44,94 +44,48 @@ define i8 @__padds_i8(i8 %a0, i8 %a1) {
   ret i8 %ret
 }
 
-define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) {
-  %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-
-
 declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
 define i16 @__padds_i16(i16 %a0, i16 %a1) {
   sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
   ret i16 %ret
 }
 
-define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) {
-  %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-
-
 declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
 define i8 @__paddus_i8(i8 %a0, i8 %a1) {
   sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1)
   ret i8 %ret
 }
 
-define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) {
-  %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-
-
 declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
 define i16 @__paddus_i16(i16 %a0, i16 %a1) {
   sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
   ret i16 %ret
 }
 
-define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) {
-  %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-
-
 declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
 define i8 @__psubs_i8(i8 %a0, i8 %a1) {
   sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1)
   ret i8 %ret
 }
 
-define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) {
-  %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-
-
 declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
 define i16 @__psubs_i16(i16 %a0, i16 %a1) {
   sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
   ret i16 %ret
 }
 
-define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) {
-  %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-
-
 declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
 define i8 @__psubus_i8(i8 %a0, i8 %a1) {
   sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1)
   ret i8 %ret
 }
 
-define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) {
-  %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-
-
 declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
 define i16 @__psubus_i16(i16 %a0, i16 %a1) {
   sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
   ret i16 %ret
 }
 
-define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) {
-  %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
 
diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll
index 77bf1a9d..0f3eb275 100644
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -47,6 +47,88 @@ int64minmax()
 
 include(`target-sse2-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;vector saturation arithmetic
+define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.padds.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
+define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.paddus.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
+define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.psubs.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
+define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r    
+}
+
+define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.psubus.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines
 
diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll
index e42d4990..1409e31d 100644
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -44,6 +44,128 @@ int64minmax()
 
 include(`target-sse2-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;vector saturation arithmetic
+define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                i32 undef, i32 undef, i32 undef, i32 undef, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__padds_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1)
+  %r = shufflevector <8 x i16> %r16, <8 x i16> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i16> %r
+}
+
+define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                i32 undef, i32 undef, i32 undef, i32 undef, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1)
+  %r = shufflevector <8 x i16> %r16, <8 x i16> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i16> %r
+}
+
+define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                i32 undef, i32 undef, i32 undef, i32 undef, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1)
+  %r = shufflevector <8 x i16> %r16, <8 x i16> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i16> %r
+}
+
+define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                i32 undef, i32 undef, i32 undef, i32 undef, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1)
+  %r = shufflevector <8 x i16> %r16, <8 x i16> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i16> %r
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines
 
diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll
index 72b81ff0..0ba62ac9 100644
--- a/builtins/target-sse4-16.ll
+++ b/builtins/target-sse4-16.ll
@@ -44,6 +44,88 @@ int64minmax()
 
 include(`target-sse4-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;vector saturation arithmetic
+define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.padds.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
+define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.paddus.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
+define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.psubs.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
+define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r    
+}
+
+define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.psubus.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines
 
diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll
index 69b355e3..6f00aa83 100644
--- a/builtins/target-sse4-8.ll
+++ b/builtins/target-sse4-8.ll
@@ -44,6 +44,49 @@ int64minmax()
 
 include(`target-sse4-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;vector saturation arithmetic
+
+define <WIDTH x i8> @__padds_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
+  %res = call <WIDTH x i8> @llvm.x86.sse2.padds.b(<WIDTH x i8> %a0, <WIDTH x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <WIDTH x i8> %res
+}
+
+define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
+  ret <WIDTH x i16> %ret
+}
+
+define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
+  %res = call <WIDTH x i8> @llvm.x86.sse2.paddus.b(<WIDTH x i8> %a0, <WIDTH x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <WIDTH x i8> %res
+}
+
+define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)  
+  ret <WIDTH x i16> %ret
+}
+
+define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
+  %res = call <WIDTH x i8> @llvm.x86.sse2.psubs.b(<WIDTH x i8> %a0, <WIDTH x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <WIDTH x i8> %res
+}
+
+define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
+  ret <WIDTH x i16> %ret
+}
+
+define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
+  %res = call <WIDTH x i8> @llvm.x86.sse2.psubus.b(<WIDTH x i8> %a0, <WIDTH x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <WIDTH x i8> %res
+}
+
+define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)  
+  ret <WIDTH x i16> %ret
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines
 
diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll
index 70acca63..e33dbf01 100644
--- a/builtins/target-sse4-common.ll
+++ b/builtins/target-sse4-common.ll
@@ -39,7 +39,7 @@ aossoa()
 rdrand_decls()
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; saturation arithmetic
+;;scalar saturation arithmetic
 
 declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
 define i8 @__padds_i8(i8 %a0, i8 %a1) {
@@ -47,95 +47,48 @@ define i8 @__padds_i8(i8 %a0, i8 %a1) {
   ret i8 %ret
 }
 
-define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) {
-  %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-
-
 declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
 define i16 @__padds_i16(i16 %a0, i16 %a1) {
   sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
   ret i16 %ret
 }
 
-define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) {
-  %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-
-
 declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
 define i8 @__paddus_i8(i8 %a0, i8 %a1) {
   sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1)
   ret i8 %ret
 }
 
-define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) {
-  %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-
-
 declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
 define i16 @__paddus_i16(i16 %a0, i16 %a1) {
   sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
   ret i16 %ret
 }
 
-define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) {
-  %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-
-
 declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
 define i8 @__psubs_i8(i8 %a0, i8 %a1) {
   sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1)
   ret i8 %ret
 }
 
-define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) {
-  %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-
-
 declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
 define i16 @__psubs_i16(i16 %a0, i16 %a1) {
   sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
   ret i16 %ret
 }
 
-define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) {
-  %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-
-
 declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
 define i8 @__psubus_i8(i8 %a0, i8 %a1) {
   sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1)
   ret i8 %ret
 }
 
-define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) {
-  %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-
-
 declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
 define i16 @__psubus_i16(i16 %a0, i16 %a1) {
   sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
   ret i16 %ret
 }
 
-define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) {
-  %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats
 
diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll
index 842db53f..5c330e51 100644
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -47,6 +47,88 @@ int64minmax()
 
 include(`target-sse4-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;vector saturation arithmetic
+define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.padds.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
+define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.paddus.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
+define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.psubs.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
+define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r    
+}
+
+define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.psubus.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines
 
diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll
index 16177b47..0478ab2c 100644
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -44,6 +44,128 @@ int64minmax()
 
 include(`target-sse4-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;vector saturation arithmetic
+define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                i32 undef, i32 undef, i32 undef, i32 undef, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__padds_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1)
+  %r = shufflevector <8 x i16> %r16, <8 x i16> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i16> %r
+}
+
+define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                i32 undef, i32 undef, i32 undef, i32 undef, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1)
+  %r = shufflevector <8 x i16> %r16, <8 x i16> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i16> %r
+}
+
+define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                i32 undef, i32 undef, i32 undef, i32 undef, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1)
+  %r = shufflevector <8 x i16> %r16, <8 x i16> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i16> %r
+}
+
+define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                i32 undef, i32 undef, i32 undef, i32 undef, 
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1)
+  %r = shufflevector <8 x i16> %r16, <8 x i16> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i16> %r
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines