From 19f73b2ede4fc2142299af13a7f24d34b6cf153a Mon Sep 17 00:00:00 2001
From: Vsevolod Livinskij
Date: Mon, 25 Nov 2013 19:16:02 +0400
Subject: [PATCH 01/16] uniform signed/unsigned int8/16

---
 builtins.cpp                   |  9 ++++++
 builtins/target-sse2-common.ll | 58 ++++++++++++++++++++++++++++++++++
 stdlib.ispc                    | 35 ++++++++++++++++++++
 3 files changed, 102 insertions(+)

diff --git a/builtins.cpp b/builtins.cpp
index 2afd92d9..c001318a 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -488,12 +488,21 @@ lSetInternalFunctions(llvm::Module *module) {
         "__num_cores",
         "__packed_load_active",
         "__packed_store_active",
+        "__padds_i8",
+        "__padds_i16",
+        "__vpadds_i8",
+        "__paddus_i8",
+        "__paddus_i16",
         "__popcnt_int32",
         "__popcnt_int64",
         "__prefetch_read_uniform_1",
         "__prefetch_read_uniform_2",
         "__prefetch_read_uniform_3",
         "__prefetch_read_uniform_nt",
+        "__psubs_i8",
+        "__psubs_i16",
+        "__psubus_i8",
+        "__psubus_i16",
         "__rcp_uniform_float",
         "__rcp_varying_float",
         "__rdrand_i16",
diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll
index ad1d88bc..070912ea 100644
--- a/builtins/target-sse2-common.ll
+++ b/builtins/target-sse2-common.ll
@@ -35,6 +35,64 @@
 define_shuffles()
 aossoa()
 rdrand_decls()
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; saturation arithmetic
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <16 x i8> @__vpadds_i8(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: vpaddsb
+  %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+
+;;declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__padds_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__padds_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
+  ret i16 %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__paddus_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__paddus_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
+  ret i16 %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__psubs_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__psubs_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
+  ret i16 %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__psubus_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__psubus_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
+  ret i16 %ret
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
diff --git a/stdlib.ispc b/stdlib.ispc
index 6768594b..464da5d4 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -4257,6 +4257,41 @@ static inline void
 fastmath() { __fastmath(); }
+///////////////////////////////////////////////////////////////////////////
+// saturation arithmetic
+
+static inline uniform int8 padds(uniform int8 a, uniform int8 b) {
+    return __padds_i8(a, b);
+}
+
+static inline uniform int16 padds(uniform int16 a, uniform int16 b) {
+    return __padds_i16(a, b);
+}
+
+static inline uniform unsigned int8 paddus(uniform unsigned int8 a, uniform unsigned int8 b) {
+    return __paddus_i8(a, b);
+}
+
+static inline uniform unsigned int16 paddus(uniform unsigned int16 a, unsigned uniform int16 b) {
+    return __paddus_i16(a, b);
+}
+
+static inline uniform int8 psubs(uniform int8 a, uniform int8 b) {
+    return __psubs_i8(a, b);
+}
+
+static inline uniform int16 psubs(uniform int16 a, uniform int16 b) {
+    return __psubs_i16(a, b);
+}
+
+static inline uniform unsigned int8 psubus(uniform unsigned int8 a, uniform unsigned int8 b) {
+    return __psubus_i8(a, b);
+}
+
+static inline uniform unsigned int16 psubus(uniform unsigned int16 a, unsigned uniform int16 b) {
+    return __psubus_i16(a, b);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // rdrand

From 35a4d1b3a27b88dce6f6c7e9e6eeef7bc5fbdf99 Mon Sep 17 00:00:00 2001
From: Vsevolod Livinskij
Date: Wed, 27 Nov 2013 00:55:57 +0400
Subject: [PATCH 02/16] Add some AVX2 intrinsics

---
 builtins.cpp                   |  9 +++-
 builtins/target-avx-common.ll  | 96 +++++++++++++++++++++++++++++++++
 builtins/target-sse2-common.ll | 55 ++++++++++++++++---
 builtins/target-sse4-common.ll | 98 ++++++++++++++++++++++++++++++++++
 stdlib.ispc                    | 47 ++++++++++++++--
 5 files changed, 292 insertions(+), 13 deletions(-)

diff --git a/builtins.cpp b/builtins.cpp
index c001318a..c6828a00 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -490,9 +490,12 @@ lSetInternalFunctions(llvm::Module *module) {
         "__packed_store_active",
         "__padds_i8",
         "__padds_i16",
-        "__vpadds_i8",
+        "__padds_vi8",
+        "__padds_vi16",
         "__paddus_i8",
         "__paddus_i16",
+        "__paddus_vi8",
+        "__paddus_vi16",
         "__popcnt_int32",
         "__popcnt_int64",
         "__prefetch_read_uniform_1",
@@ -501,8 +504,12 @@ lSetInternalFunctions(llvm::Module *module) {
         "__prefetch_read_uniform_nt",
         "__psubs_i8",
         "__psubs_i16",
+        "__psubs_vi8",
+        "__psubs_vi16",
         "__psubus_i8",
         "__psubus_i16",
+        "__psubus_vi8",
+        "__psubus_vi16",
         "__rcp_uniform_float",
         "__rcp_varying_float",
         "__rdrand_i16",
diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll
index 1c467476..d47145f2 100644
--- a/builtins/target-avx-common.ll
+++ b/builtins/target-avx-common.ll
@@ -41,7 +41,103 @@
 define_prefetches()
 define_shuffles()
 aossoa()
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; saturation arithmetic
+declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
+define i8 @__padds_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.padds.b, %a0, %a1)
+  ret i8 %ret
+}
+
+define <32 x i8> @__padds_vi8(<32 x i8> %a0, <32 x i8> %a1) {
+  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+
+
+declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
+define i16 @__padds_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.padds.w, %a0, %a1)
+  ret i16 %ret
+}
+
+define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) {
+  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+
+
+declare <32 x i8>
@llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone +define i8 @__paddus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.paddus.b, %a0, %a1) + ret i8 %ret +} + +define <32 x i8> @__paddus_vi8(<32 x i8> %a0, <32 x i8> %a1) { + %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} + + +declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone +define i16 @__paddus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.paddus.w, %a0, %a1) + ret i16 %ret +} + +define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) { + %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} + + +declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone +define i8 @__psubs_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubs.b, %a0, %a1) + ret i8 %ret +} + +define <32 x i8> @__psubs_vi8(<32 x i8> %a0, <32 x i8> %a1) { + %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} + + +declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone +define i16 @__psubs_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubs.w, %a0, %a1) + ret i16 %ret +} + +define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) { + %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} + + +declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone +define i8 @__psubus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubus.b, %a0, %a1) + ret i8 %ret +} + +define <32 x i8> @__psubus_vi8(<32 x i8> %a0, <32 x i8> %a1) { + %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} + + +declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone +define i16 @__psubus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubus.w, %a0, %a1) + ret i16 %ret +} + +define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) { + %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index 070912ea..a1e6f915 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -37,62 +37,101 @@ rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; saturation arithmetic + declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone -define <16 x i8> @__vpadds_i8(<16 x i8> %a0, <16 x i8> %a1) { - ; CHECK: vpaddsb - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - -;;declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__padds_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) ret i8 %ret } +define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res 
+} + + declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__padds_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) ret i16 %ret } +define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__paddus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) ret i8 %ret } +define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__paddus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) ret i16 %ret } +define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubs_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) ret i8 %ret } +define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubs_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) ret i16 %ret } +define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) ret i8 %ret } +define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) ret i16 %ret } +define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 50dd0582..70acca63 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -38,6 +38,104 @@ define_shuffles() aossoa() rdrand_decls() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; saturation arithmetic + +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__padds_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) + ret i8 %ret +} + +define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> 
%a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__padds_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret i16 %ret +} + +define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__paddus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) + ret i8 %ret +} + +define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__paddus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret i16 %ret +} + +define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubs_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) + ret i8 %ret +} + +define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubs_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret i16 %ret +} + +define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) + ret i8 %ret +} + +define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret i16 %ret +} + +define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/stdlib.ispc b/stdlib.ispc index 464da5d4..5b3d144c 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4264,34 +4264,73 @@ static inline uniform int8 padds(uniform int8 a, uniform int8 b) { return __padds_i8(a, b); } +static inline varying int8 padds(varying int8 a, varying int8 b) { + return __padds_vi8(a, b); +} + static inline uniform int16 padds(uniform int16 a, uniform int16 b) { return __padds_i16(a, b); } -static inline uniform unsigned int8 paddus(uniform unsigned int8 a, uniform unsigned int8 b) { +static inline 
varying int16 padds(varying int16 a, varying int16 b) { + return __padds_vi16(a, b); +} + +static inline uniform unsigned int8 paddus(uniform unsigned int8 a, + uniform unsigned int8 b) { return __paddus_i8(a, b); } -static inline uniform unsigned int16 paddus(uniform unsigned int16 a, unsigned uniform int16 b) { +static inline varying unsigned int8 paddus(varying unsigned int8 a, + varying unsigned int8 b) { + return __paddus_vi8(a, b); +} + +static inline uniform unsigned int16 paddus(uniform unsigned int16 a, + unsigned uniform int16 b) { return __paddus_i16(a, b); } +static inline varying unsigned int16 paddus(varying unsigned int16 a, + unsigned varying int16 b) { + return __paddus_vi16(a, b); +} + static inline uniform int8 psubs(uniform int8 a, uniform int8 b) { return __psubs_i8(a, b); } +static inline varying int8 psubs(varying int8 a, varying int8 b) { + return __psubs_vi8(a, b); +} + static inline uniform int16 psubs(uniform int16 a, uniform int16 b) { return __psubs_i16(a, b); } -static inline uniform unsigned int8 psubus(uniform unsigned int8 a, uniform unsigned int8 b) { +static inline varying int16 psubs(varying int16 a, varying int16 b) { + return __psubs_vi16(a, b); +} + +static inline uniform unsigned int8 psubus(uniform unsigned int8 a, + uniform unsigned int8 b) { return __psubus_i8(a, b); } -static inline uniform unsigned int16 psubus(uniform unsigned int16 a, unsigned uniform int16 b) { +static inline varying unsigned int8 psubus(varying unsigned int8 a, + varying unsigned int8 b) { + return __psubus_vi8(a, b); +} + +static inline uniform unsigned int16 psubus(uniform unsigned int16 a, + unsigned uniform int16 b) { return __psubus_i16(a, b); } +static inline varying unsigned int16 psubus(varying unsigned int16 a, + unsigned varying int16 b) { + return __psubus_vi16(a, b); +} /////////////////////////////////////////////////////////////////////////// // rdrand From 42c148bf75bf8efc498370c51a90e71010bdb725 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Fri, 29 Nov 2013 03:33:40 +0400 Subject: [PATCH 03/16] Changes for sse2 and sse4 in saturation --- builtins/target-avx-common.ll | 97 -------------------------- builtins/target-sse2-common.ll | 48 +------------ builtins/target-sse2-x2.ll | 82 ++++++++++++++++++++++ builtins/target-sse2.ll | 122 +++++++++++++++++++++++++++++++++ builtins/target-sse4-16.ll | 82 ++++++++++++++++++++++ builtins/target-sse4-8.ll | 43 ++++++++++++ builtins/target-sse4-common.ll | 49 +------------ builtins/target-sse4-x2.ll | 82 ++++++++++++++++++++++ builtins/target-sse4.ll | 122 +++++++++++++++++++++++++++++++++ 9 files changed, 535 insertions(+), 192 deletions(-) diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index d47145f2..dcca74f0 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -41,103 +41,6 @@ define_prefetches() define_shuffles() aossoa() -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; saturation arithmetic - -declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.padds.b, %a0, %a1) - ret i8 %ret -} - -define <32 x i8> @__padds_vi8(<32 x i8> %a0, <32 x i8> %a1) { - %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} - - -declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - 
sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.padds.w, %a0, %a1) - ret i16 %ret -} - -define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) { - %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} - - -declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.paddus.b, %a0, %a1) - ret i8 %ret -} - -define <32 x i8> @__paddus_vi8(<32 x i8> %a0, <32 x i8> %a1) { - %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} - - -declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.paddus.w, %a0, %a1) - ret i16 %ret -} - -define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) { - %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} - - -declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubs.b, %a0, %a1) - ret i8 %ret -} - -define <32 x i8> @__psubs_vi8(<32 x i8> %a0, <32 x i8> %a1) { - %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} - - -declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubs.w, %a0, %a1) - ret i16 %ret -} - -define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) { - %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} - - -declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone -define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubus.b, %a0, %a1) - ret i8 %ret -} - -define <32 x i8> @__psubus_vi8(<32 x i8> %a0, <32 x i8> %a1) { - %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} - - -declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubus.w, %a0, %a1) - ret i16 %ret -} - -define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) { - %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index a1e6f915..a1fec300 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -36,7 +36,7 @@ aossoa() rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; saturation arithmetic +;;scalar saturation arithmetic declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__padds_i8(i8 %a0, i8 %a1) { @@ -44,94 +44,48 @@ define i8 @__padds_i8(i8 %a0, i8 %a1) { ret i8 %ret } -define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; 
<<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__padds_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__paddus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__paddus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubs_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubs_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 77bf1a9d..0f3eb275 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -47,6 +47,88 @@ int64minmax() include(`target-sse2-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x 
i32> + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index e42d4990..1409e31d 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -44,6 +44,128 @@ int64minmax() include(`target-sse2-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__padds_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> 
@llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 72b81ff0..0ba62ac9 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -44,6 +44,88 @@ int64minmax() include(`target-sse4-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 69b355e3..6f00aa83 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -44,6 +44,49 @@ int64minmax() include(`target-sse4-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic + +define @__padds_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__padds_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret %ret +} + +define 
@__paddus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__paddus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret %ret +} + +define @__psubs_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubs_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret %ret +} + +define @__psubus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 70acca63..e33dbf01 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -39,7 +39,7 @@ aossoa() rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; saturation arithmetic +;;scalar saturation arithmetic declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__padds_i8(i8 %a0, i8 %a1) { @@ -47,95 +47,48 @@ define i8 @__padds_i8(i8 %a0, i8 %a1) { ret i8 %ret } -define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__padds_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__paddus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__paddus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubs_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubs_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> 
@llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 842db53f..5c330e51 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -47,6 +47,88 @@ int64minmax() include(`target-sse4-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 16177b47..0478ab2c 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -44,6 +44,128 @@ int64minmax() include(`target-sse4-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = 
shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__padds_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines From bec66623383e44deef6ef80209a97f33588c65b1 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Fri, 29 Nov 2013 03:45:25 +0400 Subject: [PATCH 04/16] Some cganges for avx1 and avx1.1 in saturation --- builtins/target-avx-common.ll | 51 +++++++++++++ builtins/target-avx-x2.ll | 43 +++++++++++ builtins/target-avx.ll | 82 ++++++++++++++++++++ builtins/target-avx1-i64x4base.ll | 122 ++++++++++++++++++++++++++++++ 4 files changed, 298 insertions(+) diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index dcca74f0..d5eac54f 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -41,6 +41,57 @@ define_prefetches() define_shuffles() aossoa() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;scalar saturation arithmetic + +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__padds_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__padds_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 
8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__paddus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__paddus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubs_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubs_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret i16 %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index f8fd5cd5..694afe35 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -43,6 +43,49 @@ int64minmax() include(`target-avx-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic + +define @__padds_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__padds_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret %ret +} + +define @__paddus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__paddus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret %ret +} + +define @__psubs_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubs_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret %ret +} + +define @__psubus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e98a3843..a5a497d0 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -43,6 +43,88 @@ int64minmax() include(`target-avx-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) 
{ + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index e1832030..831ae0e5 100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -43,6 +43,128 @@ int64minmax() include(`target-avx-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__padds_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + 
+define @__psubus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp From 4c330bc38bdfaace8ac8bfcac7cea92b1ca2ebdb Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Fri, 29 Nov 2013 18:40:04 +0400 Subject: [PATCH 05/16] Add code generation of saturation --- builtins/target-avx.ll | 53 ++++----------- builtins/target-avx1-i64x4base.ll | 105 +++++++----------------------- builtins/target-sse2-x2.ll | 53 ++++----------- builtins/target-sse2.ll | 105 +++++++----------------------- builtins/target-sse4-16.ll | 53 ++++----------- builtins/target-sse4-x2.ll | 53 ++++----------- builtins/target-sse4.ll | 105 +++++++----------------------- builtins/util.m4 | 52 +++++++++++++++ 8 files changed, 179 insertions(+), 400 deletions(-) diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index a5a497d0..c56ec67d 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -45,18 +45,12 @@ include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -66,17 +60,10 @@ define @__padds_vi16( %a0, %a1) { } define @__paddus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -86,17 +73,10 @@ define @__paddus_vi16( %a0, %a1) { } define @__psubs_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -106,17 +86,10 @@ define @__psubs_vi16( %a0, %a1) { } define @__psubus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index 831ae0e5..de26a29e 100644 --- 
a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -45,123 +45,68 @@ include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__padds_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__paddus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__paddus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubs_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubs_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 0f3eb275..d59513b3 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -49,18 +49,12 
@@ include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -70,17 +64,10 @@ define @__padds_vi16( %a0, %a1) { } define @__paddus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -90,17 +77,10 @@ define @__paddus_vi16( %a0, %a1) { } define @__psubs_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -110,17 +90,10 @@ define @__psubs_vi16( %a0, %a1) { } define @__psubus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 1409e31d..11c51f70 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -46,123 +46,68 @@ include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__padds_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__paddus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__paddus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + 
convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubs_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubs_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 0ba62ac9..156cccab 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -46,18 +46,12 @@ include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -67,17 +61,10 @@ define @__padds_vi16( %a0, %a1) { } define @__paddus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -87,17 +74,10 @@ define @__paddus_vi16( %a0, %a1) { } define @__psubs_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -107,17 +87,10 @@ define @__psubs_vi16( %a0, %a1) { } define @__psubus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector 
<8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 5c330e51..1f4f8332 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -49,18 +49,12 @@ include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -70,17 +64,10 @@ define @__padds_vi16( %a0, %a1) { } define @__paddus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -90,17 +77,10 @@ define @__paddus_vi16( %a0, %a1) { } define @__psubs_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -110,17 +90,10 @@ define @__psubs_vi16( %a0, %a1) { } define @__psubus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 0478ab2c..2f6ebf6a 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -46,123 +46,68 @@ include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__padds_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__paddus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x 
i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__paddus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubs_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubs_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } diff --git a/builtins/util.m4 b/builtins/util.m4 index e1c9bf97..5f75d23a 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,58 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector convertation utilities +;; convert 4-wide vector into 8-wide vector +;; +;; $1: vector element type +;; $2: 4-wide vector +;; $3: 8-wide vector + +define(`convert4to8', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, + <8 x i32> +') + +define(`convert4to16', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, + <16 x i32> +') + +define(`convert8to16', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <16 x i32> +') + +;; convert 4-wide vector into 8-wide vector +;; +;; $1: vector element type +;; $2: 8-wide vector +;; $3: 4-wide vector + +define(`convert8to4', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + + +define(`convert16to4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <4 x i32> +') + +define(`convert16to8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; vector deconstruction utilities ;; split 8-wide vector into 2 4-wide vectors ;; From 4faff1a63cbe9970f895d96518583897f2f8eb66 Mon Sep 17 
00:00:00 2001 From: Vsevolod Livinskij Date: Sat, 30 Nov 2013 10:48:18 +0400 Subject: [PATCH 06/16] structural change --- builtins/target-avx-common.ll | 52 +----- builtins/target-avx-x2.ll | 44 +---- builtins/target-avx.ll | 56 +----- builtins/target-avx1-i64x4base.ll | 68 +------- builtins/target-sse2-common.ll | 52 +----- builtins/target-sse2-x2.ll | 56 +----- builtins/target-sse2.ll | 68 +------- builtins/target-sse4-16.ll | 56 +----- builtins/target-sse4-8.ll | 44 +---- builtins/target-sse4-common.ll | 52 +----- builtins/target-sse4-x2.ll | 56 +----- builtins/target-sse4.ll | 68 +------- builtins/util.m4 | 273 ++++++++++++++++++++++++++++++ 13 files changed, 285 insertions(+), 660 deletions(-) diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index d5eac54f..32157a77 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -40,57 +40,7 @@ ctlztz() define_prefetches() define_shuffles() aossoa() - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;scalar saturation arithmetic - -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret i16 %ret -} +saturation_arithmetic_scalar() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 694afe35..cde63e7b 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -40,52 +40,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec16() include(`target-avx-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__padds_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret %ret -} - 
-define @__paddus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__paddus_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret %ret -} - -define @__psubs_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__psubs_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret %ret -} - -define @__psubus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__psubus_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index c56ec67d..8f20bfed 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -40,64 +40,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec8() include(`target-avx-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res -} - -define @__paddus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res -} - -define @__psubs_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res -} - -define @__psubus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) - ret %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index de26a29e..a2d292f2 100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -40,76 +40,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec4() include(`target-avx-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__padds_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__paddus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define 
@__paddus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubs_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index a1fec300..b5c5559c 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -34,57 +34,7 @@ define_prefetches() define_shuffles() aossoa() rdrand_decls() - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;scalar saturation arithmetic - -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret i16 %ret -} +saturation_arithmetic_scalar() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index d59513b3..b4b52d91 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -44,64 +44,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() 
+saturation_arithmetic_vec8() include(`target-sse2-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res -} - -define @__paddus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res -} - -define @__psubs_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res -} - -define @__psubus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) - ret %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 11c51f70..bdf6f848 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -41,76 +41,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec4() include(`target-sse2-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__padds_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__paddus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubs_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - 
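;; For reference, a hand-expanded sketch (assumed, not taken from the patch) of what
;; the convert8to16()/convert16to8() helpers used above turn into: each is a single
;; shufflevector that widens the operand with undef lanes and then selects the low
;; half of the result back out, roughly:
;;   %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
;;         <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
;;                     i32 undef, i32 undef, i32 undef, i32 undef,
;;                     i32 undef, i32 undef, i32 undef, i32 undef>
;;   %r  = shufflevector <16 x i8> %r16, <16 x i8> undef,
;;         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;; The high lanes are undef, which is harmless because they are discarded after the
;; saturating intrinsic runs.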
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 156cccab..1c0b045a 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -41,64 +41,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec8() include(`target-sse4-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res -} - -define @__paddus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res -} - -define @__psubs_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res -} - -define @__psubus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) - ret %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 6f00aa83..49351856 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -41,52 +41,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec16() include(`target-sse4-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__padds_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret %ret -} - -define @__paddus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__paddus_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret %ret -} - -define @__psubs_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__psubs_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret %ret -} - -define @__psubus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__psubus_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index e33dbf01..8eeaa413 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -37,57 +37,7 @@ define_prefetches() define_shuffles() aossoa() 
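;; For reference, a hand-expanded sketch (assumed, not taken from the patch) of the
;; sse_binary_scalar() helper used by the scalar saturating ops below: the scalar
;; operands are moved into lane 0 of an SSE-sized vector, the packed intrinsic is
;; applied, and lane 0 of the result is extracted, e.g. for __padds_i8 roughly:
;;   %v0  = insertelement <16 x i8> undef, i8 %a0, i32 0
;;   %v1  = insertelement <16 x i8> undef, i8 %a1, i32 0
;;   %vr  = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
;;   %ret = extractelement <16 x i8> %vr, i32 0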
rdrand_decls() - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;scalar saturation arithmetic - -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret i16 %ret -} +saturation_arithmetic_scalar() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 1f4f8332..2cd0ea4d 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -44,64 +44,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec8() include(`target-sse4-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res -} - -define @__paddus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res -} - -define @__psubs_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res -} - -define @__psubus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16( %a0, %a1) { - %res = call 
@llvm.x86.sse2.psubus.w( %a0, %a1) - ret %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 2f6ebf6a..96effe39 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -41,76 +41,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec4() include(`target-sse4-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__padds_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__paddus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubs_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/util.m4 b/builtins/util.m4 index 5f75d23a..0d5ed2de 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -77,6 +77,42 @@ define(`convert8to16', ` i32 undef, i32 undef, i32 undef, i32 undef> ') +define(`convert4to32', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, + <32 x i32> +') + +define(`convert8to32', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, + <32 x i32> +') + +define(`convert16to32', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, + <32 x i32> +') + ;; convert 4-wide vector into 8-wide vector ;; ;; $1: vector element type @@ -99,6 +135,243 @@ define(`convert16to8', ` <8 x i32> ') +define(`convert32to4', ` + $3 = shufflevector <32 x $1> $2, <32 x $1> undef, + <4 x i32> +') + +define(`convert32to8', ` + $3 = shufflevector <32 x $1> $2, <32 x $1> undef, + <8 x i32> +') + +define(`convert32to16', ` + $3 = shufflevector <32 x $1> $2, <32 x $1> undef, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;saturation arithmetic +;;scalar saturation arithmetic + +define(`saturation_arithmetic_scalar', ` +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__padds_i8(i8 %a0, i8 %a1) { + 
sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__padds_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__paddus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__paddus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubs_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubs_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret i16 %ret +} +') + +;;4-wide vector saturation arithmetic + +define(`saturation_arithmetic_vec4', ` +define @__padds_vi8(, ) { + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to4(i8, %r16, %r) + ret %r +} + +define @__padds_vi16(, ) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret %r +} + +define @__paddus_vi8(, ) { + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to4(i8, %r16, %r) + ret %r +} + +define @__paddus_vi16(, ) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret %r +} + +define @__psubs_vi8(, ) { + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to4(i8, %r16, %r) + ret %r +} + +define @__psubs_vi16(, ) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret %r +} + +define @__psubus_vi8(, ) { + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to4(i8, %r16, %r) + ret %r +} + +define @__psubus_vi16(, ) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret %r +} +') + +;;8-wide vector saturation arithmetic + +define(`saturation_arithmetic_vec8', ` +define @__padds_vi8(, ) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + 
convert16to8(i8, %r16, %r) + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} +') + +;;16-wide vector saturation arithmetic + +define(`saturation_arithmetic_vec16', ` +define @__padds_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__padds_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret %ret +} + +define @__paddus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__paddus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret %ret +} + +define @__psubs_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubs_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret %ret +} + +define @__psubus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret %ret +} +') + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector deconstruction utilities From 65768c20aec633b7c9f33b8c150169aeaca82c49 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Thu, 5 Dec 2013 00:34:14 +0400 Subject: [PATCH 07/16] Added tests for saturation and some fixes for generic and avx target --- builtins/target-avx.ll | 1 - builtins/target-avx1.ll | 1 + builtins/target-avx11.ll | 1 + builtins/target-avx2.ll | 1 + builtins/target-generic-1.ll | 2 + builtins/target-generic-16.ll | 2 +- builtins/target-generic-4.ll | 2 +- builtins/target-generic-8.ll | 2 +- builtins/target-generic-common.ll | 1 + builtins/util.m4 | 102 +++++++++++++++++++++++++++++- stdlib.ispc | 8 +-- tests/padds_i16.ispc | 11 ++++ tests/padds_i8.ispc | 11 ++++ tests/padds_vi16.ispc | 11 ++++ tests/padds_vi8.ispc | 11 ++++ tests/paddus_i16.ispc | 11 ++++ tests/paddus_i8.ispc | 11 ++++ tests/paddus_vi16.ispc | 11 ++++ tests/paddus_vi8.ispc | 11 ++++ tests/psubs_i16.ispc | 11 ++++ tests/psubs_i8.ispc | 11 ++++ tests/psubs_vi16.ispc | 11 ++++ tests/psubs_vi8.ispc | 11 ++++ tests/psubus_i16.ispc | 11 ++++ tests/psubus_i8.ispc | 11 ++++ tests/psubus_vi16.ispc | 11 ++++ tests/psubus_vi8.ispc | 11 ++++ 27 files changed, 288 insertions(+), 11 deletions(-) create mode 100644 tests/padds_i16.ispc create mode 100644 tests/padds_i8.ispc create mode 100644 tests/padds_vi16.ispc create mode 100644 tests/padds_vi8.ispc create mode 100644 tests/paddus_i16.ispc 
create mode 100644 tests/paddus_i8.ispc create mode 100644 tests/paddus_vi16.ispc create mode 100644 tests/paddus_vi8.ispc create mode 100644 tests/psubs_i16.ispc create mode 100644 tests/psubs_i8.ispc create mode 100644 tests/psubs_vi16.ispc create mode 100644 tests/psubs_vi8.ispc create mode 100644 tests/psubus_i16.ispc create mode 100644 tests/psubus_i8.ispc create mode 100644 tests/psubus_vi16.ispc create mode 100644 tests/psubus_vi8.ispc diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 8f20bfed..e98a3843 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -40,7 +40,6 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec8() include(`target-avx-common.ll') diff --git a/builtins/target-avx1.ll b/builtins/target-avx1.ll index 9c86cab8..f0cf1efb 100644 --- a/builtins/target-avx1.ll +++ b/builtins/target-avx1.ll @@ -32,6 +32,7 @@ include(`target-avx.ll') rdrand_decls() +saturation_arithmetic_vec8() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index fea0a7c2..706314a5 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -34,6 +34,7 @@ include(`target-avx.ll') ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', `rdrand_definition()') +saturation_arithmetic_vec8() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index f4a0ee07..c5f8e84f 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -38,6 +38,7 @@ include(`target-avx.ll') ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', `rdrand_definition()') +saturation_arithmetic_vec8() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 910565dd..bb974932 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -9,6 +9,8 @@ packed_load_and_store() scans() int64minmax() aossoa() +saturation_arithmetic_scalar() +saturation_arithmetic_novec() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store diff --git a/builtins/target-generic-16.ll b/builtins/target-generic-16.ll index 807fd242..36a2ee4c 100644 --- a/builtins/target-generic-16.ll +++ b/builtins/target-generic-16.ll @@ -31,4 +31,4 @@ define(`WIDTH',`16') include(`target-generic-common.ll') - +saturation_arithmetic_vec16() diff --git a/builtins/target-generic-4.ll b/builtins/target-generic-4.ll index 7eb1f300..a7e8dcaa 100644 --- a/builtins/target-generic-4.ll +++ b/builtins/target-generic-4.ll @@ -31,4 +31,4 @@ define(`WIDTH',`4') include(`target-generic-common.ll') - +saturation_arithmetic_vec4() diff --git a/builtins/target-generic-8.ll b/builtins/target-generic-8.ll index bd9261ff..b692322e 100644 --- a/builtins/target-generic-8.ll +++ b/builtins/target-generic-8.ll @@ -31,4 +31,4 @@ define(`WIDTH',`8') include(`target-generic-common.ll') - +saturation_arithmetic_vec8() diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 92b7a18e..c4d3b950 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -41,6 +41,7 @@ stdlib_core() scans() reduce_equal(WIDTH) rdrand_decls() +saturation_arithmetic_scalar() 
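;; As a reminder of the intended semantics (informal, not part of the generated
;; code): the saturating forms clamp instead of wrapping, i.e. for unsigned 8-bit
;; operands
;;   paddus(a, b) = min(a + b, 255)     psubus(a, b) = max(a - b, 0)
;; and the signed forms clamp to [-128, 127] for int8 and [-32768, 32767] for
;; int16. The new tests below exercise exactly these boundary values.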
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; broadcast/rotate/shuffle diff --git a/builtins/util.m4 b/builtins/util.m4 index 0d5ed2de..e0f7aaec 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -50,12 +50,28 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector convertation utilities -;; convert 4-wide vector into 8-wide vector +;; convert 1-wide vector into 8-wide vector ;; ;; $1: vector element type -;; $2: 4-wide vector +;; $2: 1-wide vector ;; $3: 8-wide vector + +define(`convert1to8', ` + $3 = shufflevector <1 x $1> $2, <1 x $1> undef, + <8 x i32> +') + + +define(`convert1to16', ` + $3 = shufflevector <1 x $1> $2, <1 x $1> undef, + <16 x i32> +') + define(`convert4to8', ` $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <8 x i32> $2, <8 x $1> undef, + <1 x i32> +') + + +define(`convert16to1', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <1 x i32> +') define(`convert8to4', ` $3 = shufflevector <8 x $1> $2, <8 x $1> undef, @@ -204,6 +232,74 @@ define i16 @__psubus_i16(i16 %a0, i16 %a1) { } ') +;;no vector saturation arithmetic + +define(`saturation_arithmetic_novec', ` +define @__padds_vi8(, ) { + convert1to16(i8, %0, %v0) + convert1to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to1(i8, %r16, %r) + ret %r +} + +define @__padds_vi16(, ) { + convert1to8(i16, %0, %v0) + convert1to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to1(i16, %r16, %r) + ret %r +} + +define @__paddus_vi8(, ) { + convert1to16(i8, %0, %v0) + convert1to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to1(i8, %r16, %r) + ret %r +} + +define @__paddus_vi16(, ) { + convert1to8(i16, %0, %v0) + convert1to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to1(i16, %r16, %r) + ret %r +} + +define @__psubs_vi8(, ) { + convert1to16(i8, %0, %v0) + convert1to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to1(i8, %r16, %r) + ret %r +} + +define @__psubs_vi16(, ) { + convert1to8(i16, %0, %v0) + convert1to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to1(i16, %r16, %r) + ret %r +} + +define @__psubus_vi8(, ) { + convert1to16(i8, %0, %v0) + convert1to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to1(i8, %r16, %r) + ret %r +} + +define @__psubus_vi16(, ) { + convert1to8(i16, %0, %v0) + convert1to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to1(i16, %r16, %r) + ret %r +} +') + ;;4-wide vector saturation arithmetic define(`saturation_arithmetic_vec4', ` diff --git a/stdlib.ispc b/stdlib.ispc index 5b3d144c..9e296687 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4287,12 +4287,12 @@ static inline varying unsigned int8 paddus(varying unsigned int8 a, } static inline uniform unsigned int16 paddus(uniform unsigned int16 a, - unsigned uniform int16 b) { + uniform unsigned int16 b) { return __paddus_i16(a, b); } static inline varying unsigned int16 paddus(varying unsigned int16 a, - unsigned varying int16 b) { + varying unsigned int16 b) { return __paddus_vi16(a, b); } @@ -4323,12 +4323,12 @@ static inline varying unsigned int8 psubus(varying unsigned int8 a, } static inline uniform 
unsigned int16 psubus(uniform unsigned int16 a, - unsigned uniform int16 b) { + uniform unsigned int16 b) { return __psubus_i16(a, b); } static inline varying unsigned int16 psubus(varying unsigned int16 a, - unsigned varying int16 b) { + varying unsigned int16 b) { return __psubus_vi16(a, b); } /////////////////////////////////////////////////////////////////////////// diff --git a/tests/padds_i16.ispc b/tests/padds_i16.ispc new file mode 100644 index 00000000..4668071b --- /dev/null +++ b/tests/padds_i16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a = 32767, b = 32767; // max signed int16 + RET[programIndex] = padds(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 32767; +} diff --git a/tests/padds_i8.ispc b/tests/padds_i8.ispc new file mode 100644 index 00000000..81da8a21 --- /dev/null +++ b/tests/padds_i8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a = 127, b = 127; // max signed int8 + RET[programIndex] = padds(a1, b1); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 127; +} diff --git a/tests/padds_vi16.ispc b/tests/padds_vi16.ispc new file mode 100644 index 00000000..7c6848e7 --- /dev/null +++ b/tests/padds_vi16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int16 a = 32767, b = 32767; // max signed int16 + RET[programIndex] = padds(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 32767; +} diff --git a/tests/padds_vi8.ispc b/tests/padds_vi8.ispc new file mode 100644 index 00000000..5d6196be --- /dev/null +++ b/tests/padds_vi8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int8 a = 127, b = 127; // max signed int8 + RET[programIndex] = padds(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 127; +} diff --git a/tests/paddus_i16.ispc b/tests/paddus_i16.ispc new file mode 100644 index 00000000..d2939677 --- /dev/null +++ b/tests/paddus_i16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a = 65535, b = 65535; // max unsigned int16 + RET[programIndex] = paddus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 65535; +} diff --git a/tests/paddus_i8.ispc b/tests/paddus_i8.ispc new file mode 100644 index 00000000..23de8c21 --- /dev/null +++ b/tests/paddus_i8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a = 255, b = 255; // max unsigned int8 + RET[programIndex] = paddus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 255; +} diff --git a/tests/paddus_vi16.ispc b/tests/paddus_vi16.ispc new file mode 100644 index 00000000..803259f5 --- /dev/null +++ b/tests/paddus_vi16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int16 a = 65535, b = 65535; // max unsigned int16 + RET[programIndex] = paddus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 65535; +} diff --git 
a/tests/paddus_vi8.ispc b/tests/paddus_vi8.ispc new file mode 100644 index 00000000..3d7d3509 --- /dev/null +++ b/tests/paddus_vi8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int8 a = 255, b = 255; // max unsigned int8 + RET[programIndex] = paddus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 255; +} diff --git a/tests/psubs_i16.ispc b/tests/psubs_i16.ispc new file mode 100644 index 00000000..9038215e --- /dev/null +++ b/tests/psubs_i16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a = -32768, b = 32767; // min and max signed int16 + RET[programIndex] = psubs(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = -32768; +} diff --git a/tests/psubs_i8.ispc b/tests/psubs_i8.ispc new file mode 100644 index 00000000..1a661520 --- /dev/null +++ b/tests/psubs_i8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a = -128, b = 127; // min and max signed int8 + RET[programIndex] = psubs(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = -128; +} diff --git a/tests/psubs_vi16.ispc b/tests/psubs_vi16.ispc new file mode 100644 index 00000000..b1e2cf48 --- /dev/null +++ b/tests/psubs_vi16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int16 a = -32768, b = 32767; // min and max unsigned int16 + RET[programIndex] = psubs(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = -32768; +} diff --git a/tests/psubs_vi8.ispc b/tests/psubs_vi8.ispc new file mode 100644 index 00000000..a6148a3f --- /dev/null +++ b/tests/psubs_vi8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int8 a = -128, b = 127; // min and max unsigned int8 + RET[programIndex] = psubs(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = -128; +} diff --git a/tests/psubus_i16.ispc b/tests/psubus_i16.ispc new file mode 100644 index 00000000..b31b250e --- /dev/null +++ b/tests/psubus_i16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a = 0, b = 32767; // min and max unsigned int16 + RET[programIndex] = psubus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/psubus_i8.ispc b/tests/psubus_i8.ispc new file mode 100644 index 00000000..c073d306 --- /dev/null +++ b/tests/psubus_i8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a = 0, b = 255; // min and max unsigned int8 + RET[programIndex] = psubus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/psubus_vi16.ispc b/tests/psubus_vi16.ispc new file mode 100644 index 00000000..fd4db693 --- /dev/null +++ b/tests/psubus_vi16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int16 a = 0, b = 32767; // min and max unsigned int16 + 
RET[programIndex] = psubus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/psubus_vi8.ispc b/tests/psubus_vi8.ispc new file mode 100644 index 00000000..3c00308f --- /dev/null +++ b/tests/psubus_vi8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int8 a = 0, b = 255; // min and max unsigned int8 + RET[programIndex] = psubus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} From ea94658411fa3e2bfc11cb5c4b791a6143fe521f Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Fri, 6 Dec 2013 17:20:37 +0400 Subject: [PATCH 08/16] Some saturation tests fixes --- tests/padds_i8.ispc | 2 +- tests/paddus_i8.ispc | 2 +- tests/paddus_vi8.ispc | 2 +- tests/psubs_i16.ispc | 2 +- tests/psubus_i16.ispc | 2 +- tests/psubus_i8.ispc | 2 +- tests/psubus_vi8.ispc | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/padds_i8.ispc b/tests/padds_i8.ispc index 81da8a21..d7bdc8b6 100644 --- a/tests/padds_i8.ispc +++ b/tests/padds_i8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int8 a = 127, b = 127; // max signed int8 - RET[programIndex] = padds(a1, b1); + RET[programIndex] = padds(a, b); } export void result(uniform float RET[]) { diff --git a/tests/paddus_i8.ispc b/tests/paddus_i8.ispc index 23de8c21..1c585369 100644 --- a/tests/paddus_i8.ispc +++ b/tests/paddus_i8.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int8 a = 255, b = 255; // max unsigned int8 + uniform unsigned int8 a = 255, b = 255; // max unsigned int8 RET[programIndex] = paddus(a, b); } diff --git a/tests/paddus_vi8.ispc b/tests/paddus_vi8.ispc index 3d7d3509..c9d7a115 100644 --- a/tests/paddus_vi8.ispc +++ b/tests/paddus_vi8.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = 255, b = 255; // max unsigned int8 + varying unsigned int8 a = 255, b = 255; // max unsigned int8 RET[programIndex] = paddus(a, b); } diff --git a/tests/psubs_i16.ispc b/tests/psubs_i16.ispc index 9038215e..d66f51ad 100644 --- a/tests/psubs_i16.ispc +++ b/tests/psubs_i16.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int8 a = -32768, b = 32767; // min and max signed int16 + uniform int16 a = -32768, b = 32767; // min and max signed int16 RET[programIndex] = psubs(a, b); } diff --git a/tests/psubus_i16.ispc b/tests/psubus_i16.ispc index b31b250e..c02922d2 100644 --- a/tests/psubus_i16.ispc +++ b/tests/psubus_i16.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int8 a = 0, b = 32767; // min and max unsigned int16 + uniform unsigned int8 a = 0, b = 32767; // min and max unsigned int16 RET[programIndex] = psubus(a, b); } diff --git a/tests/psubus_i8.ispc b/tests/psubus_i8.ispc index c073d306..a45e9f6e 100644 --- a/tests/psubus_i8.ispc +++ b/tests/psubus_i8.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int8 a = 0, b = 255; // min and max unsigned int8 + uniform unsigned int8 a = 0, b = 255; // min 
and max unsigned int8 RET[programIndex] = psubus(a, b); } diff --git a/tests/psubus_vi8.ispc b/tests/psubus_vi8.ispc index 3c00308f..46005204 100644 --- a/tests/psubus_vi8.ispc +++ b/tests/psubus_vi8.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = 0, b = 255; // min and max unsigned int8 + varying unsigned int8 a = 0, b = 255; // min and max unsigned int8 RET[programIndex] = psubus(a, b); } From 9a135c48d95c84bde1a1038b0de6430087ca04d6 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Mon, 9 Dec 2013 00:20:52 +0400 Subject: [PATCH 09/16] Functions name change --- stdlib.ispc | 48 +++++++++++++++++++++--------------------- tests/padds_i16.ispc | 2 +- tests/padds_i8.ispc | 2 +- tests/padds_vi16.ispc | 2 +- tests/padds_vi8.ispc | 2 +- tests/paddus_i16.ispc | 4 ++-- tests/paddus_i8.ispc | 2 +- tests/paddus_vi16.ispc | 4 ++-- tests/paddus_vi8.ispc | 2 +- tests/psubs_i16.ispc | 2 +- tests/psubs_i8.ispc | 2 +- tests/psubs_vi16.ispc | 2 +- tests/psubs_vi8.ispc | 2 +- tests/psubus_i16.ispc | 2 +- tests/psubus_i8.ispc | 2 +- tests/psubus_vi16.ispc | 4 ++-- tests/psubus_vi8.ispc | 2 +- 17 files changed, 43 insertions(+), 43 deletions(-) diff --git a/stdlib.ispc b/stdlib.ispc index 9e296687..487b4184 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4260,75 +4260,75 @@ static inline void fastmath() { /////////////////////////////////////////////////////////////////////////// // saturation arithmetic -static inline uniform int8 padds(uniform int8 a, uniform int8 b) { +static inline uniform int8 saturating_add(uniform int8 a, uniform int8 b) { return __padds_i8(a, b); } -static inline varying int8 padds(varying int8 a, varying int8 b) { +static inline varying int8 saturating_add(varying int8 a, varying int8 b) { return __padds_vi8(a, b); } -static inline uniform int16 padds(uniform int16 a, uniform int16 b) { +static inline uniform int16 saturating_add(uniform int16 a, uniform int16 b) { return __padds_i16(a, b); } -static inline varying int16 padds(varying int16 a, varying int16 b) { +static inline varying int16 saturating_add(varying int16 a, varying int16 b) { return __padds_vi16(a, b); } -static inline uniform unsigned int8 paddus(uniform unsigned int8 a, - uniform unsigned int8 b) { +static inline uniform unsigned int8 saturating_add(uniform unsigned int8 a, + uniform unsigned int8 b) { return __paddus_i8(a, b); } -static inline varying unsigned int8 paddus(varying unsigned int8 a, - varying unsigned int8 b) { +static inline varying unsigned int8 saturating_add(varying unsigned int8 a, + varying unsigned int8 b) { return __paddus_vi8(a, b); } -static inline uniform unsigned int16 paddus(uniform unsigned int16 a, - uniform unsigned int16 b) { +static inline uniform unsigned int16 saturating_add(uniform unsigned int16 a, + uniform unsigned int16 b) { return __paddus_i16(a, b); } -static inline varying unsigned int16 paddus(varying unsigned int16 a, - varying unsigned int16 b) { +static inline varying unsigned int16 saturating_add(varying unsigned int16 a, + varying unsigned int16 b) { return __paddus_vi16(a, b); } -static inline uniform int8 psubs(uniform int8 a, uniform int8 b) { +static inline uniform int8 saturating_sub(uniform int8 a, uniform int8 b) { return __psubs_i8(a, b); } -static inline varying int8 psubs(varying int8 a, varying int8 b) { +static inline varying int8 saturating_sub(varying int8 a, varying int8 b) { return __psubs_vi8(a, b); } -static inline uniform int16 psubs(uniform 
int16 a, uniform int16 b) { +static inline uniform int16 saturating_sub(uniform int16 a, uniform int16 b) { return __psubs_i16(a, b); } -static inline varying int16 psubs(varying int16 a, varying int16 b) { +static inline varying int16 saturating_sub(varying int16 a, varying int16 b) { return __psubs_vi16(a, b); } -static inline uniform unsigned int8 psubus(uniform unsigned int8 a, - uniform unsigned int8 b) { +static inline uniform unsigned int8 saturating_sub(uniform unsigned int8 a, + uniform unsigned int8 b) { return __psubus_i8(a, b); } -static inline varying unsigned int8 psubus(varying unsigned int8 a, - varying unsigned int8 b) { +static inline varying unsigned int8 saturating_sub(varying unsigned int8 a, + varying unsigned int8 b) { return __psubus_vi8(a, b); } -static inline uniform unsigned int16 psubus(uniform unsigned int16 a, - uniform unsigned int16 b) { +static inline uniform unsigned int16 saturating_sub(uniform unsigned int16 a, + uniform unsigned int16 b) { return __psubus_i16(a, b); } -static inline varying unsigned int16 psubus(varying unsigned int16 a, - varying unsigned int16 b) { +static inline varying unsigned int16 saturating_sub(varying unsigned int16 a, + varying unsigned int16 b) { return __psubus_vi16(a, b); } /////////////////////////////////////////////////////////////////////////// diff --git a/tests/padds_i16.ispc b/tests/padds_i16.ispc index 4668071b..930593ac 100644 --- a/tests/padds_i16.ispc +++ b/tests/padds_i16.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int16 a = 32767, b = 32767; // max signed int16 - RET[programIndex] = padds(a, b); + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/padds_i8.ispc b/tests/padds_i8.ispc index d7bdc8b6..6d72a61b 100644 --- a/tests/padds_i8.ispc +++ b/tests/padds_i8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int8 a = 127, b = 127; // max signed int8 - RET[programIndex] = padds(a, b); + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/padds_vi16.ispc b/tests/padds_vi16.ispc index 7c6848e7..b48d776a 100644 --- a/tests/padds_vi16.ispc +++ b/tests/padds_vi16.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { varying int16 a = 32767, b = 32767; // max signed int16 - RET[programIndex] = padds(a, b); + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/padds_vi8.ispc b/tests/padds_vi8.ispc index 5d6196be..71d42cb8 100644 --- a/tests/padds_vi8.ispc +++ b/tests/padds_vi8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { varying int8 a = 127, b = 127; // max signed int8 - RET[programIndex] = padds(a, b); + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/paddus_i16.ispc b/tests/paddus_i16.ispc index d2939677..968953fa 100644 --- a/tests/paddus_i16.ispc +++ b/tests/paddus_i16.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int16 a = 65535, b = 65535; // max unsigned int16 - RET[programIndex] = paddus(a, b); + uniform unsigned int16 a = 65535, b = 65535; // max 
unsigned int16 + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/paddus_i8.ispc b/tests/paddus_i8.ispc index 1c585369..44c41a6c 100644 --- a/tests/paddus_i8.ispc +++ b/tests/paddus_i8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform unsigned int8 a = 255, b = 255; // max unsigned int8 - RET[programIndex] = paddus(a, b); + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/paddus_vi16.ispc b/tests/paddus_vi16.ispc index 803259f5..4d15e49b 100644 --- a/tests/paddus_vi16.ispc +++ b/tests/paddus_vi16.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = 65535, b = 65535; // max unsigned int16 - RET[programIndex] = paddus(a, b); + varying unsigned int16 a = 65535, b = 65535; // max unsigned int16 + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/paddus_vi8.ispc b/tests/paddus_vi8.ispc index c9d7a115..77fcec7a 100644 --- a/tests/paddus_vi8.ispc +++ b/tests/paddus_vi8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { varying unsigned int8 a = 255, b = 255; // max unsigned int8 - RET[programIndex] = paddus(a, b); + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubs_i16.ispc b/tests/psubs_i16.ispc index d66f51ad..163af2da 100644 --- a/tests/psubs_i16.ispc +++ b/tests/psubs_i16.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int16 a = -32768, b = 32767; // min and max signed int16 - RET[programIndex] = psubs(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubs_i8.ispc b/tests/psubs_i8.ispc index 1a661520..1dba8fe3 100644 --- a/tests/psubs_i8.ispc +++ b/tests/psubs_i8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int8 a = -128, b = 127; // min and max signed int8 - RET[programIndex] = psubs(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubs_vi16.ispc b/tests/psubs_vi16.ispc index b1e2cf48..3208e842 100644 --- a/tests/psubs_vi16.ispc +++ b/tests/psubs_vi16.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { varying int16 a = -32768, b = 32767; // min and max unsigned int16 - RET[programIndex] = psubs(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubs_vi8.ispc b/tests/psubs_vi8.ispc index a6148a3f..143aaf4e 100644 --- a/tests/psubs_vi8.ispc +++ b/tests/psubs_vi8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { varying int8 a = -128, b = 127; // min and max unsigned int8 - RET[programIndex] = psubs(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubus_i16.ispc b/tests/psubus_i16.ispc index c02922d2..bb62f03f 100644 --- a/tests/psubus_i16.ispc +++ b/tests/psubus_i16.ispc @@ -3,7 +3,7 @@ 
export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform unsigned int8 a = 0, b = 32767; // min and max unsigned int16 - RET[programIndex] = psubus(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubus_i8.ispc b/tests/psubus_i8.ispc index a45e9f6e..176ecc33 100644 --- a/tests/psubus_i8.ispc +++ b/tests/psubus_i8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform unsigned int8 a = 0, b = 255; // min and max unsigned int8 - RET[programIndex] = psubus(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubus_vi16.ispc b/tests/psubus_vi16.ispc index fd4db693..ca58f374 100644 --- a/tests/psubus_vi16.ispc +++ b/tests/psubus_vi16.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = 0, b = 32767; // min and max unsigned int16 - RET[programIndex] = psubus(a, b); + varying unsigned int16 a = 0, b = 32767; // min and max unsigned int16 + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubus_vi8.ispc b/tests/psubus_vi8.ispc index 46005204..e730fd7e 100644 --- a/tests/psubus_vi8.ispc +++ b/tests/psubus_vi8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { varying unsigned int8 a = 0, b = 255; // min and max unsigned int8 - RET[programIndex] = psubus(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { From 07c6f1714a3500c767d3736d850d8996bbcf11a2 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Sun, 22 Dec 2013 19:28:26 +0400 Subject: [PATCH 10/16] Some fixes in function names and more tests were added.
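For reference, the sketch below illustrates the saturating behaviour the renamed stdlib wrappers are meant to expose and that the tests in this series check. The wrapper names (saturating_add, saturating_sub) come from stdlib.ispc; the demo function and its name are only illustrative and are not part of the library or of this patch:

    // Minimal ispc sketch (hypothetical demo function, mirroring the test style):
    // saturating_add/saturating_sub clamp to the type's range instead of wrapping.
    export void saturation_demo(uniform float RET[]) {
        uniform int8 a = 127, b = 1;            // max signed int8
        uniform int16 c = -32768, d = 1;        // min signed int16
        uniform unsigned int8 e = 255, f = 1;   // max unsigned int8
        RET[0] = saturating_add(a, b);          // stays 127, no wrap to -128
        RET[1] = saturating_sub(c, d);          // stays -32768
        RET[2] = saturating_add(e, f);          // stays 255
    }

These wrappers resolve to the per-target __padds_*/__paddus_*/__psubs_*/__psubus_* builtins; the m4 changes below select the implementation flavour from the target WIDTH.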
--- builtins/target-avx-common.ll | 2 +- builtins/target-avx-x2.ll | 2 +- builtins/target-avx1-i64x4base.ll | 2 +- builtins/target-avx1.ll | 2 +- builtins/target-avx11.ll | 3 +- builtins/target-avx2.ll | 3 +- builtins/target-generic-1.ll | 2 +- builtins/target-generic-16.ll | 2 +- builtins/target-generic-4.ll | 2 +- builtins/target-generic-8.ll | 2 +- builtins/target-generic-common.ll | 2 +- builtins/target-sse2-common.ll | 2 +- builtins/target-sse2-x2.ll | 2 +- builtins/target-sse2.ll | 2 +- builtins/target-sse4-16.ll | 2 +- builtins/target-sse4-8.ll | 2 +- builtins/target-sse4-common.ll | 2 +- builtins/target-sse4-x2.ll | 2 +- builtins/target-sse4.ll | 2 +- builtins/util.m4 | 125 ++++++++++++++++-------------- tests/padds_i16-2.ispc | 11 +++ tests/padds_i16.ispc | 6 +- tests/padds_i8-2.ispc | 11 +++ tests/padds_i8.ispc | 6 +- tests/padds_vi16-2.ispc | 11 +++ tests/padds_vi16.ispc | 4 +- tests/padds_vi8-2.ispc | 11 +++ tests/padds_vi8.ispc | 4 +- tests/paddus_i16.ispc | 6 +- tests/paddus_i8.ispc | 6 +- tests/paddus_vi16.ispc | 4 +- tests/paddus_vi8.ispc | 4 +- tests/psubs_i16-2.ispc | 11 +++ tests/psubs_i16.ispc | 6 +- tests/psubs_i8-2.ispc | 11 +++ tests/psubs_i8.ispc | 6 +- tests/psubs_vi16-2.ispc | 11 +++ tests/psubs_vi16.ispc | 4 +- tests/psubs_vi8-2.ispc | 11 +++ tests/psubs_vi8.ispc | 4 +- tests/psubus_i16.ispc | 6 +- tests/psubus_i8.ispc | 6 +- tests/psubus_vi16.ispc | 4 +- tests/psubus_vi8.ispc | 4 +- 44 files changed, 215 insertions(+), 118 deletions(-) create mode 100644 tests/padds_i16-2.ispc create mode 100644 tests/padds_i8-2.ispc create mode 100644 tests/padds_vi16-2.ispc create mode 100644 tests/padds_vi8-2.ispc create mode 100644 tests/psubs_i16-2.ispc create mode 100644 tests/psubs_i8-2.ispc create mode 100644 tests/psubs_vi16-2.ispc create mode 100644 tests/psubs_vi8-2.ispc diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index 32157a77..d6b577b8 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -40,7 +40,7 @@ ctlztz() define_prefetches() define_shuffles() aossoa() -saturation_arithmetic_scalar() +saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index cde63e7b..8d3e29c8 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -40,7 +40,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec16() +saturation_arithmetic() include(`target-avx-common.ll') diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index a2d292f2..d9c60c26 100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -40,7 +40,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec4() +saturation_arithmetic() include(`target-avx-common.ll') diff --git a/builtins/target-avx1.ll b/builtins/target-avx1.ll index f0cf1efb..a9ddc112 100644 --- a/builtins/target-avx1.ll +++ b/builtins/target-avx1.ll @@ -32,7 +32,7 @@ include(`target-avx.ll') rdrand_decls() -saturation_arithmetic_vec8() +saturation_arithmetic() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index 706314a5..c4c421a0 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -34,7 +34,8 @@ include(`target-avx.ll') ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', LLVM_VERSION, 
`LLVM_3_1', `rdrand_decls()', `rdrand_definition()') -saturation_arithmetic_vec8() + +saturation_arithmetic() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index c5f8e84f..20ecef47 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -38,7 +38,8 @@ include(`target-avx.ll') ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', `rdrand_definition()') -saturation_arithmetic_vec8() + +saturation_arithmetic() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index bb974932..af343496 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -9,7 +9,7 @@ packed_load_and_store() scans() int64minmax() aossoa() -saturation_arithmetic_scalar() +saturation_arithmetic() saturation_arithmetic_novec() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-generic-16.ll b/builtins/target-generic-16.ll index 36a2ee4c..df04187c 100644 --- a/builtins/target-generic-16.ll +++ b/builtins/target-generic-16.ll @@ -31,4 +31,4 @@ define(`WIDTH',`16') include(`target-generic-common.ll') -saturation_arithmetic_vec16() +saturation_arithmetic() diff --git a/builtins/target-generic-4.ll b/builtins/target-generic-4.ll index a7e8dcaa..e43f45c5 100644 --- a/builtins/target-generic-4.ll +++ b/builtins/target-generic-4.ll @@ -31,4 +31,4 @@ define(`WIDTH',`4') include(`target-generic-common.ll') -saturation_arithmetic_vec4() +saturation_arithmetic() diff --git a/builtins/target-generic-8.ll b/builtins/target-generic-8.ll index b692322e..6b87509d 100644 --- a/builtins/target-generic-8.ll +++ b/builtins/target-generic-8.ll @@ -31,4 +31,4 @@ define(`WIDTH',`8') include(`target-generic-common.ll') -saturation_arithmetic_vec8() +saturation_arithmetic() diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index c4d3b950..6f5199d8 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -41,7 +41,7 @@ stdlib_core() scans() reduce_equal(WIDTH) rdrand_decls() -saturation_arithmetic_scalar() +saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; broadcast/rotate/shuffle diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index b5c5559c..d8a461aa 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -34,7 +34,7 @@ define_prefetches() define_shuffles() aossoa() rdrand_decls() -saturation_arithmetic_scalar() +saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index b4b52d91..1cb2abc4 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -44,7 +44,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec8() +saturation_arithmetic() include(`target-sse2-common.ll') diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index bdf6f848..ee8b533c 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -41,7 +41,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec4() +saturation_arithmetic() include(`target-sse2-common.ll') diff --git a/builtins/target-sse4-16.ll 
b/builtins/target-sse4-16.ll index 1c0b045a..00ff2519 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -41,7 +41,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec8() +saturation_arithmetic() include(`target-sse4-common.ll') diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 49351856..15c577e8 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -41,7 +41,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec16() +saturation_arithmetic() include(`target-sse4-common.ll') diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 8eeaa413..2dd5c149 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -37,7 +37,7 @@ define_prefetches() define_shuffles() aossoa() rdrand_decls() -saturation_arithmetic_scalar() +saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 2cd0ea4d..59a6942a 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -44,7 +44,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec8() +saturation_arithmetic() include(`target-sse4-common.ll') diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 96effe39..4762836d 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -41,7 +41,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec4() +saturation_arithmetic() include(`target-sse4-common.ll') diff --git a/builtins/util.m4 b/builtins/util.m4 index e0f7aaec..6f36f71e 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -179,10 +179,17 @@ define(`convert32to16', ` ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;saturation arithmetic -;;scalar saturation arithmetic +;;saturation arithmetic + +define(`saturation_arithmetic', +`ifelse(WIDTH, `4', `saturation_arithmetic_vec4()', + WIDTH, `8', `saturation_arithmetic_vec8()', + WIDTH, `16', `saturation_arithmetic_vec16()', + `saturation_arithmetic_uniform()')') -define(`saturation_arithmetic_scalar', ` +;;uniform saturation arithmetic + +define(`saturation_arithmetic_uniform', ` declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__padds_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) @@ -303,168 +310,168 @@ define @__psubus_vi16(, ) { ;;4-wide vector saturation arithmetic define(`saturation_arithmetic_vec4', ` -define @__padds_vi8(, ) { +define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) convert16to4(i8, %r16, %r) - ret %r + ret <4 x i8> %r } -define @__padds_vi16(, ) { +define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) convert8to4(i16, %r16, %r) - ret %r + ret <4 x i16> %r } -define @__paddus_vi8(, ) { +define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) convert16to4(i8, %r16, %r) - ret %r + ret <4 x i8> %r } -define @__paddus_vi16(, ) { +define <4 x i16> @__paddus_vi16(<4 x 
i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) convert8to4(i16, %r16, %r) - ret %r + ret <4 x i16> %r } -define @__psubs_vi8(, ) { +define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) convert16to4(i8, %r16, %r) - ret %r + ret <4 x i8> %r } -define @__psubs_vi16(, ) { +define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) convert8to4(i16, %r16, %r) - ret %r + ret <4 x i16> %r } -define @__psubus_vi8(, ) { +define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) convert16to4(i8, %r16, %r) - ret %r + ret <4 x i8> %r } -define @__psubus_vi16(, ) { +define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) convert8to4(i16, %r16, %r) - ret %r + ret <4 x i16> %r } ') ;;8-wide vector saturation arithmetic define(`saturation_arithmetic_vec8', ` -define @__padds_vi8(, ) { +define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) convert16to8(i8, %r16, %r) - ret %r + ret <8 x i8> %r } -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res +define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res } -define @__paddus_vi8(, ) { +define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) convert16to8(i8, %r16, %r) - ret %r + ret <8 x i8> %r } -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res +define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res } -define @__psubs_vi8(, ) { +define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) convert16to8(i8, %r16, %r) - ret %r + ret <8 x i8> %r } -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res +define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res } -define @__psubus_vi8(, ) { +define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) convert16to8(i8, %r16, %r) - ret %r + ret <8 x i8> %r } -define @__psubus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) - ret %res +define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res } ') ;;16-wide vector saturation arithmetic define(`saturation_arithmetic_vec16', ` -define @__padds_vi8( %a0, 
%a1) { - %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res +define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res } -define @__padds_vi16( %a0, %a1) { +define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret %ret + ret <16 x i16> %ret } -define @__paddus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res +define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res } -define @__paddus_vi16( %a0, %a1) { +define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret %ret + ret <16 x i16> %ret } -define @__psubs_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res +define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res } -define @__psubs_vi16( %a0, %a1) { +define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret %ret + ret <16 x i16> %ret } -define @__psubus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res +define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res } -define @__psubus_vi16( %a0, %a1) { +define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret %ret + ret <16 x i16> %ret } ') diff --git a/tests/padds_i16-2.ispc b/tests/padds_i16-2.ispc new file mode 100644 index 00000000..83234804 --- /dev/null +++ b/tests/padds_i16-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int16 a = -32768; // min signed int16 + RET[programIndex] = saturating_add(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (uniform int16) -32768; +} diff --git a/tests/padds_i16.ispc b/tests/padds_i16.ispc index 930593ac..e5456416 100644 --- a/tests/padds_i16.ispc +++ b/tests/padds_i16.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int16 a = 32767, b = 32767; // max signed int16 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int16 a = 32767; // max signed int16 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 32767; + RET[programIndex] = (uniform int16) 32767; } diff --git a/tests/padds_i8-2.ispc b/tests/padds_i8-2.ispc new file mode 100644 index 00000000..9a303d70 --- /dev/null +++ b/tests/padds_i8-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int8 a = -128; // min signed int8 + RET[programIndex] = saturating_add(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 
(uniform int8) -128; +} diff --git a/tests/padds_i8.ispc b/tests/padds_i8.ispc index 6d72a61b..bbcc4cc7 100644 --- a/tests/padds_i8.ispc +++ b/tests/padds_i8.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int8 a = 127, b = 127; // max signed int8 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int8 a = 127; // max signed int8 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 127; + RET[programIndex] = (uniform int8) 127; } diff --git a/tests/padds_vi16-2.ispc b/tests/padds_vi16-2.ispc new file mode 100644 index 00000000..5f1eda37 --- /dev/null +++ b/tests/padds_vi16-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int16 a = -32768, b = aFOO[programIndex]; // max signed int16 + RET[programIndex] = saturating_add(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (varying int16) -32768; +} diff --git a/tests/padds_vi16.ispc b/tests/padds_vi16.ispc index b48d776a..e3bd0f51 100644 --- a/tests/padds_vi16.ispc +++ b/tests/padds_vi16.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = 32767, b = 32767; // max signed int16 + varying int16 a = 32767, b = aFOO[programIndex]; // max signed int16 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 32767; + RET[programIndex] = (varying int16) 32767; } diff --git a/tests/padds_vi8-2.ispc b/tests/padds_vi8-2.ispc new file mode 100644 index 00000000..e3302d18 --- /dev/null +++ b/tests/padds_vi8-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int8 a = -128, b = aFOO[programIndex]; // max signed int8 + RET[programIndex] = saturating_add(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (varying int8) -128; +} diff --git a/tests/padds_vi8.ispc b/tests/padds_vi8.ispc index 71d42cb8..df921414 100644 --- a/tests/padds_vi8.ispc +++ b/tests/padds_vi8.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = 127, b = 127; // max signed int8 + varying int8 a = 127, b = aFOO[programIndex]; // max signed int8 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 127; + RET[programIndex] = (varying int8) 127; } diff --git a/tests/paddus_i16.ispc b/tests/paddus_i16.ispc index 968953fa..e38f6db7 100644 --- a/tests/paddus_i16.ispc +++ b/tests/paddus_i16.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform unsigned int16 a = 65535, b = 65535; // max unsigned int16 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform unsigned int16 a = 65535; // max unsigned int16 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 65535; + RET[programIndex] = (uniform unsigned int16) 65535; } diff --git a/tests/paddus_i8.ispc b/tests/paddus_i8.ispc index 44c41a6c..7cd3ecf8 100644 --- a/tests/paddus_i8.ispc +++ b/tests/paddus_i8.ispc @@ -1,11 
+1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform unsigned int8 a = 255, b = 255; // max unsigned int8 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform unsigned int8 a = 255; // max unsigned int8 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 255; + RET[programIndex] = (uniform unsigned int8) 255; } diff --git a/tests/paddus_vi16.ispc b/tests/paddus_vi16.ispc index 4d15e49b..c4454cd2 100644 --- a/tests/paddus_vi16.ispc +++ b/tests/paddus_vi16.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int16 a = 65535, b = 65535; // max unsigned int16 + varying unsigned int16 a = 65535, b = aFOO[programIndex]; // max unsigned int16 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 65535; + RET[programIndex] = (varying unsigned int16) 65535; } diff --git a/tests/paddus_vi8.ispc b/tests/paddus_vi8.ispc index 77fcec7a..b7b970ff 100644 --- a/tests/paddus_vi8.ispc +++ b/tests/paddus_vi8.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int8 a = 255, b = 255; // max unsigned int8 + varying unsigned int8 a = 255, b = aFOO[programIndex]; // max unsigned int8 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 255; + RET[programIndex] = (varying unsigned int8) 255; } diff --git a/tests/psubs_i16-2.ispc b/tests/psubs_i16-2.ispc new file mode 100644 index 00000000..ace62b1c --- /dev/null +++ b/tests/psubs_i16-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int16 a = 32767; // max signed int16 + RET[programIndex] = saturating_sub(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (uniform int16) 32767; +} diff --git a/tests/psubs_i16.ispc b/tests/psubs_i16.ispc index 163af2da..47f3d2b9 100644 --- a/tests/psubs_i16.ispc +++ b/tests/psubs_i16.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int16 a = -32768, b = 32767; // min and max signed int16 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int16 a = -32768; // min signed int16 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = -32768; + RET[programIndex] = (uniform int16) -32768; } diff --git a/tests/psubs_i8-2.ispc b/tests/psubs_i8-2.ispc new file mode 100644 index 00000000..6d3d608a --- /dev/null +++ b/tests/psubs_i8-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int8 a = 127; // max signed int8 + RET[programIndex] = saturating_sub(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (uniform int8) 127; +} diff --git a/tests/psubs_i8.ispc b/tests/psubs_i8.ispc index 1dba8fe3..fbc24d25 100644 --- a/tests/psubs_i8.ispc +++ b/tests/psubs_i8.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { 
- uniform int8 a = -128, b = 127; // min and max signed int8 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int8 a = -128; // min signed int8 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = -128; + RET[programIndex] = (uniform int8) -128; } diff --git a/tests/psubs_vi16-2.ispc b/tests/psubs_vi16-2.ispc new file mode 100644 index 00000000..ef1b2ef4 --- /dev/null +++ b/tests/psubs_vi16-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int16 a = 32767, b = aFOO[programIndex]; // min unsigned int16 + RET[programIndex] = saturating_sub(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (varying int16) 32767; +} diff --git a/tests/psubs_vi16.ispc b/tests/psubs_vi16.ispc index 3208e842..e405a23f 100644 --- a/tests/psubs_vi16.ispc +++ b/tests/psubs_vi16.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = -32768, b = 32767; // min and max unsigned int16 + varying int16 a = -32768, b = aFOO[programIndex]; // min unsigned int16 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = -32768; + RET[programIndex] = (varying int16) -32768; } diff --git a/tests/psubs_vi8-2.ispc b/tests/psubs_vi8-2.ispc new file mode 100644 index 00000000..b7fb02c6 --- /dev/null +++ b/tests/psubs_vi8-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int8 a = 127, b = aFOO[programIndex]; // min unsigned int8 + RET[programIndex] = saturating_sub(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (varying int8) 127; +} diff --git a/tests/psubs_vi8.ispc b/tests/psubs_vi8.ispc index 143aaf4e..7d852f0a 100644 --- a/tests/psubs_vi8.ispc +++ b/tests/psubs_vi8.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = -128, b = 127; // min and max unsigned int8 + varying int8 a = -128, b = aFOO[programIndex]; // min unsigned int8 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = -128; + RET[programIndex] = (varying int8) -128; } diff --git a/tests/psubus_i16.ispc b/tests/psubus_i16.ispc index bb62f03f..a7f60603 100644 --- a/tests/psubus_i16.ispc +++ b/tests/psubus_i16.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform unsigned int8 a = 0, b = 32767; // min and max unsigned int16 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform unsigned int8 a = 0; // min unsigned int16 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 0; + RET[programIndex] = (uniform unsigned int8) 0; } diff --git a/tests/psubus_i8.ispc b/tests/psubus_i8.ispc index 176ecc33..7cb7ecdc 100644 --- a/tests/psubus_i8.ispc +++ b/tests/psubus_i8.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform unsigned int8 a = 0, b = 255; // min and max unsigned int8 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform 
float b) { + uniform unsigned int8 a = 0; // min unsigned int8 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 0; + RET[programIndex] = (uniform unsigned int8) 0; } diff --git a/tests/psubus_vi16.ispc b/tests/psubus_vi16.ispc index ca58f374..e441b699 100644 --- a/tests/psubus_vi16.ispc +++ b/tests/psubus_vi16.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int16 a = 0, b = 32767; // min and max unsigned int16 + varying unsigned int16 a = 0, b = aFOO[programIndex]; // min unsigned int16 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 0; + RET[programIndex] = (varying unsigned int16) 0; } diff --git a/tests/psubus_vi8.ispc b/tests/psubus_vi8.ispc index e730fd7e..7ba5f14a 100644 --- a/tests/psubus_vi8.ispc +++ b/tests/psubus_vi8.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int8 a = 0, b = 255; // min and max unsigned int8 + varying unsigned int8 a = 0, b = aFOO[programIndex]; // min unsigned int8 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 0; + RET[programIndex] = (varying unsigned int8) 0; } From 323587f10f2f7a02104a627692afbffcda822cb5 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Thu, 2 Jan 2014 16:48:56 +0400 Subject: [PATCH 11/16] Scalar implementation and implementation for targets which don't have h/w instructions --- builtins/target-generic-16.ll | 2 +- builtins/target-generic-32.ll | 1 + builtins/target-generic-4.ll | 2 +- builtins/target-generic-64.ll | 1 + builtins/target-generic-8.ll | 2 +- builtins/util.m4 | 231 +++++++++++++++++++--------------- 6 files changed, 134 insertions(+), 105 deletions(-) diff --git a/builtins/target-generic-16.ll b/builtins/target-generic-16.ll index df04187c..cc5644bc 100644 --- a/builtins/target-generic-16.ll +++ b/builtins/target-generic-16.ll @@ -31,4 +31,4 @@ define(`WIDTH',`16') include(`target-generic-common.ll') -saturation_arithmetic() +saturation_arithmetic_novec() diff --git a/builtins/target-generic-32.ll b/builtins/target-generic-32.ll index 5f89bcdf..8eb31c48 100644 --- a/builtins/target-generic-32.ll +++ b/builtins/target-generic-32.ll @@ -31,3 +31,4 @@ define(`WIDTH',`32') include(`target-generic-common.ll') +saturation_arithmetic_novec() diff --git a/builtins/target-generic-4.ll b/builtins/target-generic-4.ll index e43f45c5..d80c5b91 100644 --- a/builtins/target-generic-4.ll +++ b/builtins/target-generic-4.ll @@ -31,4 +31,4 @@ define(`WIDTH',`4') include(`target-generic-common.ll') -saturation_arithmetic() +saturation_arithmetic_novec() diff --git a/builtins/target-generic-64.ll b/builtins/target-generic-64.ll index 09443f8e..6a044c41 100644 --- a/builtins/target-generic-64.ll +++ b/builtins/target-generic-64.ll @@ -31,3 +31,4 @@ define(`WIDTH',`64') include(`target-generic-common.ll') +saturation_arithmetic_novec() diff --git a/builtins/target-generic-8.ll b/builtins/target-generic-8.ll index 6b87509d..4353658c 100644 --- a/builtins/target-generic-8.ll +++ b/builtins/target-generic-8.ll @@ -31,4 +31,4 @@ define(`WIDTH',`8') include(`target-generic-common.ll') -saturation_arithmetic() +saturation_arithmetic_novec() diff --git a/builtins/util.m4 b/builtins/util.m4 index 6f36f71e..de48a0a1 100644 --- a/builtins/util.m4 +++ 
b/builtins/util.m4 @@ -187,124 +187,151 @@ define(`saturation_arithmetic', WIDTH, `16', `saturation_arithmetic_vec16()', `saturation_arithmetic_uniform()')') -;;uniform saturation arithmetic +;; utility function used by saturation_arithmetic_uniform below. This shouldn't be called by +;; target .ll files directly. +;; $1: {add,sub} (used in constructing function names) -define(`saturation_arithmetic_uniform', ` -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) +define(`saturation_arithmetic_uniform_universal', ` +declare <16 x i8> @llvm.x86.sse2.p$1s.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__p$1s_i8(i8 %a0, i8 %a1) { + %a0_i16 = sext i8 %a0 to i16 + %a1_i16 = sext i8 %a1 to i16 + %res = $1 i16 %a0_i16, %a1_i16 + %over_mask = icmp sgt i16 %res, 127 + %over_res = select i1 %over_mask, i16 127, i16 %res + %under_mask = icmp slt i16 %res, -128 + %ret_i16 = select i1 %under_mask, i16 -128, i16 %over_res + %ret = trunc i16 %ret_i16 to i8 ret i8 %ret } -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) +declare <8 x i16> @llvm.x86.sse2.p$1s.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__p$1s_i16(i16 %a0, i16 %a1) { + %a0_i32 = sext i16 %a0 to i32 + %a1_i32 = sext i16 %a1 to i32 + %res = $1 i32 %a0_i32, %a1_i32 + %over_mask = icmp sgt i32 %res, 32767 + %over_res = select i1 %over_mask, i32 32767, i32 %res + %under_mask = icmp slt i32 %res, -32768 + %ret_i32 = select i1 %under_mask, i32 -32768, i32 %over_res + %ret = trunc i32 %ret_i32 to i16 ret i16 %ret } -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) +declare <16 x i8> @llvm.x86.sse2.p$1us.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__p$1us_i8(i8 %a0, i8 %a1) { + %a0_i16 = zext i8 %a0 to i16 + %a1_i16 = zext i8 %a1 to i16 + %res = $1 i16 %a0_i16, %a1_i16 + %over_mask = icmp ugt i16 %res, 255 + %over_res = select i1 %over_mask, i16 255, i16 %res + %under_mask = icmp slt i16 %res, 0 + %ret_i16 = select i1 %under_mask, i16 0, i16 %over_res + %ret = trunc i16 %ret_i16 to i8 ret i8 %ret } -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) +declare <8 x i16> @llvm.x86.sse2.p$1us.w(<8 x i16>, <8 x i16>) nounwind 
readnone +define i16 @__p$1us_i16(i16 %a0, i16 %a1) { + %a0_i32 = zext i16 %a0 to i32 + %a1_i32 = zext i16 %a1 to i32 + %res = $1 i32 %a0_i32, %a1_i32 + %over_mask = icmp ugt i32 %res, 65535 + %over_res = select i1 %over_mask, i32 65535, i32 %res + %under_mask = icmp slt i32 %res, 0 + %ret_i32 = select i1 %under_mask, i32 0, i32 %over_res + %ret = trunc i32 %ret_i32 to i16 ret i16 %ret } ') -;;no vector saturation arithmetic +;;uniform saturation arithmetic + +define(`saturation_arithmetic_uniform', ` +saturation_arithmetic_uniform_universal(sub) +saturation_arithmetic_uniform_universal(add) +') + +;; create vector constant. Used by saturation_arithmetic_novec_universal below. + +define(`const_vector', ` +ifelse(WIDTH, `4', `<$1 $2, $1 $2, $1 $2, $1 $2>', + WIDTH, `8', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>', + WIDTH, `16', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>', + WIDTH, `32', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>', + WIDTH, `64', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>', + `<$1 $2>')') + +;; utility function used by saturation_arithmetic_novec below. This shouldn't be called by +;; target .ll files directly. 
+;; $1: {add,sub} (used in constructing function names) + +define(`saturation_arithmetic_novec_universal', ` +define @__p$1s_vi8(, ) { + %v0_i16 = sext %0 to + %v1_i16 = sext %1 to + %res = $1 %v0_i16, %v1_i16 + %over_mask = icmp sgt %res, const_vector(i16, 127) + %over_res = select %over_mask, const_vector(i16, 127), %res + %under_mask = icmp slt %res, const_vector(i16, -128) + %ret_i16 = select %under_mask, const_vector(i16, -128), %over_res + %ret = trunc %ret_i16 to + ret %ret +} + +define @__p$1s_vi16(, ) { + %v0_i32 = sext %0 to + %v1_i32 = sext %1 to + %res = $1 %v0_i32, %v1_i32 + %over_mask = icmp sgt %res, const_vector(i32, 32767) + %over_res = select %over_mask, const_vector(i32, 32767), %res + %under_mask = icmp slt %res, const_vector(i32, -32768) + %ret_i32 = select %under_mask, const_vector(i32, -32768), %over_res + %ret = trunc %ret_i32 to + ret %ret +} + +define @__p$1us_vi8(, ) { + %v0_i16 = zext %0 to + %v1_i16 = zext %1 to + %res = $1 %v0_i16, %v1_i16 + %over_mask = icmp ugt %res, const_vector(i16, 255) + %over_res = select %over_mask, const_vector(i16, 255), %res + %under_mask = icmp slt %res, const_vector(i16, 0) + %ret_i16 = select %under_mask, const_vector(i16, 0), %over_res + %ret = trunc %ret_i16 to + ret %ret +} + +define @__p$1us_vi16(, ) { + %v0_i32 = zext %0 to + %v1_i32 = zext %1 to + %res = $1 %v0_i32, %v1_i32 + %over_mask = icmp ugt %res, const_vector(i32, 65535) + %over_res = select %over_mask, const_vector(i32, 65535), %res + %under_mask = icmp slt %res, const_vector(i32, 0) + %ret_i32 = select %under_mask, const_vector(i32, 0), %over_res + %ret = trunc %ret_i32 to + ret %ret +} +') + +;; implementation for targets which doesn't have h/w instructions define(`saturation_arithmetic_novec', ` -define @__padds_vi8(, ) { - convert1to16(i8, %0, %v0) - convert1to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to1(i8, %r16, %r) - ret %r -} - -define @__padds_vi16(, ) { - convert1to8(i16, %0, %v0) - convert1to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to1(i16, %r16, %r) - ret %r -} - -define @__paddus_vi8(, ) { - convert1to16(i8, %0, %v0) - convert1to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to1(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16(, ) { - convert1to8(i16, %0, %v0) - convert1to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to1(i16, %r16, %r) - ret %r -} - -define @__psubs_vi8(, ) { - convert1to16(i8, %0, %v0) - convert1to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to1(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16(, ) { - convert1to8(i16, %0, %v0) - convert1to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to1(i16, %r16, %r) - ret %r -} - -define @__psubus_vi8(, ) { - convert1to16(i8, %0, %v0) - convert1to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to1(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16(, ) { - convert1to8(i16, %0, %v0) - convert1to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to1(i16, %r16, %r) - ret %r -} +saturation_arithmetic_novec_universal(sub) +saturation_arithmetic_novec_universal(add) ') ;;4-wide vector saturation arithmetic From 
97cc5b7f485269e55931df34dd4de630cd4d848a Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Mon, 6 Jan 2014 15:24:09 +0400 Subject: [PATCH 12/16] Added varying CFG and non-overflow part of the tests. --- tests/padds_i16-2.ispc | 11 ----------- tests/padds_i16.ispc | 22 +++++++++++++++++++--- tests/padds_i8-2.ispc | 11 ----------- tests/padds_i8.ispc | 22 +++++++++++++++++++--- tests/padds_vi16-2.ispc | 11 ----------- tests/padds_vi16.ispc | 24 ++++++++++++++++++++---- tests/padds_vi8-2.ispc | 11 ----------- tests/padds_vi8.ispc | 24 ++++++++++++++++++++---- tests/paddus_i16.ispc | 16 +++++++++++++--- tests/paddus_i8.ispc | 16 +++++++++++++--- tests/paddus_vi16.ispc | 18 ++++++++++++++---- tests/paddus_vi8.ispc | 19 +++++++++++++++---- tests/psubs_i16-2.ispc | 11 ----------- tests/psubs_i16.ispc | 22 +++++++++++++++++++--- tests/psubs_i8-2.ispc | 11 ----------- tests/psubs_i8.ispc | 22 +++++++++++++++++++--- tests/psubs_vi16-2.ispc | 11 ----------- tests/psubs_vi16.ispc | 24 ++++++++++++++++++++---- tests/psubs_vi8-2.ispc | 11 ----------- tests/psubs_vi8.ispc | 24 ++++++++++++++++++++---- tests/psubus_i16.ispc | 16 +++++++++++++--- tests/psubus_i8.ispc | 16 +++++++++++++--- tests/psubus_vi16.ispc | 18 ++++++++++++++---- tests/psubus_vi8.ispc | 18 ++++++++++++++---- 24 files changed, 265 insertions(+), 144 deletions(-) delete mode 100644 tests/padds_i16-2.ispc delete mode 100644 tests/padds_i8-2.ispc delete mode 100644 tests/padds_vi16-2.ispc delete mode 100644 tests/padds_vi8-2.ispc delete mode 100644 tests/psubs_i16-2.ispc delete mode 100644 tests/psubs_i8-2.ispc delete mode 100644 tests/psubs_vi16-2.ispc delete mode 100644 tests/psubs_vi8-2.ispc diff --git a/tests/padds_i16-2.ispc b/tests/padds_i16-2.ispc deleted file mode 100644 index 83234804..00000000 --- a/tests/padds_i16-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int16 a = -32768; // min signed int16 - RET[programIndex] = saturating_add(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (uniform int16) -32768; -} diff --git a/tests/padds_i16.ispc b/tests/padds_i16.ispc index e5456416..c763dd37 100644 --- a/tests/padds_i16.ispc +++ b/tests/padds_i16.ispc @@ -2,10 +2,26 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int16 a = 32767; // max signed int16 - RET[programIndex] = saturating_add(a, b); + uniform int16 a_max = 32767, a_min = -32768; // max and min signed int16 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_add(a_min, -b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform int16) 32767; + if (programIndex % 3 == 0) { + RET[programIndex] = (uniform int16) 32767; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (uniform int16) -32768; + } + else { + RET[programIndex] = (uniform int16) -32763; + } } diff --git a/tests/padds_i8-2.ispc b/tests/padds_i8-2.ispc deleted file mode 100644 index 9a303d70..00000000 --- a/tests/padds_i8-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int8 a = -128; // min signed int8 - RET[programIndex] 
= saturating_add(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (uniform int8) -128; -} diff --git a/tests/padds_i8.ispc b/tests/padds_i8.ispc index bbcc4cc7..7d272828 100644 --- a/tests/padds_i8.ispc +++ b/tests/padds_i8.ispc @@ -2,10 +2,26 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int8 a = 127; // max signed int8 - RET[programIndex] = saturating_add(a, b); + uniform int8 a_max = 127, a_min = -128; // max and min signed int8 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_add(a_min, -b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform int8) 127; + if (programIndex % 3 == 0) { + RET[programIndex] = (uniform int8) 127; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (uniform int8) -128; + } + else { + RET[programIndex] = (uniform int8) -123; + } } diff --git a/tests/padds_vi16-2.ispc b/tests/padds_vi16-2.ispc deleted file mode 100644 index 5f1eda37..00000000 --- a/tests/padds_vi16-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = -32768, b = aFOO[programIndex]; // max signed int16 - RET[programIndex] = saturating_add(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (varying int16) -32768; -} diff --git a/tests/padds_vi16.ispc b/tests/padds_vi16.ispc index e3bd0f51..5834a47a 100644 --- a/tests/padds_vi16.ispc +++ b/tests/padds_vi16.ispc @@ -1,11 +1,27 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = 32767, b = aFOO[programIndex]; // max signed int16 - RET[programIndex] = saturating_add(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying int16 a_max = 32767, a_min = -32768; // max and min signed int16 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_add(a_min, -b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying int16) 32767; + if (programIndex % 3 == 0) { + RET[programIndex] = (varying int16) 32767; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (varying int16) -32768; + } + else { + RET[programIndex] = (varying int16) -32763; + } } diff --git a/tests/padds_vi8-2.ispc b/tests/padds_vi8-2.ispc deleted file mode 100644 index e3302d18..00000000 --- a/tests/padds_vi8-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = -128, b = aFOO[programIndex]; // max signed int8 - RET[programIndex] = saturating_add(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (varying int8) -128; -} diff --git a/tests/padds_vi8.ispc b/tests/padds_vi8.ispc index df921414..0aca03d4 100644 --- a/tests/padds_vi8.ispc +++ b/tests/padds_vi8.ispc @@ -1,11 +1,27 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = 127, b = aFOO[programIndex]; // max signed int8 - 
RET[programIndex] = saturating_add(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying int8 a_max = 127, a_min = -128; // max and min signed int8 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_add(a_min, -b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying int8) 127; + if (programIndex % 3 == 0) { + RET[programIndex] = (varying int8) 127; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (varying int8) -128; + } + else { + RET[programIndex] = (varying int8) -123; + } } diff --git a/tests/paddus_i16.ispc b/tests/paddus_i16.ispc index e38f6db7..2032f161 100644 --- a/tests/paddus_i16.ispc +++ b/tests/paddus_i16.ispc @@ -2,10 +2,20 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform unsigned int16 a = 65535; // max unsigned int16 - RET[programIndex] = saturating_add(a, b); + uniform unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform unsigned int16) 65535; + if (programIndex % 2 == 0) { + RET[programIndex] = (uniform unsigned int16) 65535; + } + else { + RET[programIndex] = (uniform unsigned int16) 5; + } } diff --git a/tests/paddus_i8.ispc b/tests/paddus_i8.ispc index 7cd3ecf8..97436a86 100644 --- a/tests/paddus_i8.ispc +++ b/tests/paddus_i8.ispc @@ -2,10 +2,20 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform unsigned int8 a = 255; // max unsigned int8 - RET[programIndex] = saturating_add(a, b); + uniform unsigned int8 a_max = 255, a_min = 0; // max and min unsigned int8 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform unsigned int8) 255; + if (programIndex % 2 == 0) { + RET[programIndex] = (uniform unsigned int8) 255; + } + else { + RET[programIndex] = (uniform unsigned int8) 5; + } } diff --git a/tests/paddus_vi16.ispc b/tests/paddus_vi16.ispc index c4454cd2..d8bfa000 100644 --- a/tests/paddus_vi16.ispc +++ b/tests/paddus_vi16.ispc @@ -1,11 +1,21 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int16 a = 65535, b = aFOO[programIndex]; // max unsigned int16 - RET[programIndex] = saturating_add(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying unsigned int16) 65535; + if (programIndex % 2 == 0) { + RET[programIndex] = (varying unsigned int16) 65535; + } + else { + RET[programIndex] = (varying unsigned int16) 5; + } } diff --git a/tests/paddus_vi8.ispc b/tests/paddus_vi8.ispc index b7b970ff..59baa6fb 100644 --- a/tests/paddus_vi8.ispc +++ 
b/tests/paddus_vi8.ispc @@ -1,11 +1,22 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int8 a = 255, b = aFOO[programIndex]; // max unsigned int8 - RET[programIndex] = saturating_add(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying unsigned int8 a_max = 255, a_min = 0; // max and min signed int8 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying unsigned int8) 255; + if (programIndex % 2 == 0) { + RET[programIndex] = (varying unsigned int8) 255; + } + else { + RET[programIndex] = (varying unsigned int8) 5; + } } + diff --git a/tests/psubs_i16-2.ispc b/tests/psubs_i16-2.ispc deleted file mode 100644 index ace62b1c..00000000 --- a/tests/psubs_i16-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int16 a = 32767; // max signed int16 - RET[programIndex] = saturating_sub(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (uniform int16) 32767; -} diff --git a/tests/psubs_i16.ispc b/tests/psubs_i16.ispc index 47f3d2b9..4f27b3b4 100644 --- a/tests/psubs_i16.ispc +++ b/tests/psubs_i16.ispc @@ -2,10 +2,26 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int16 a = -32768; // min signed int16 - RET[programIndex] = saturating_sub(a, b); + uniform int16 a_max = 32767, a_min = -32768; // max and min signed int16 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_sub(a_max, -b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform int16) -32768; + if (programIndex % 3 == 0) { + RET[programIndex] = (uniform int16) -32768; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (uniform int16) 32767; + } + else { + RET[programIndex] = (uniform int16) 32762; + } } diff --git a/tests/psubs_i8-2.ispc b/tests/psubs_i8-2.ispc deleted file mode 100644 index 6d3d608a..00000000 --- a/tests/psubs_i8-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int8 a = 127; // max signed int8 - RET[programIndex] = saturating_sub(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (uniform int8) 127; -} diff --git a/tests/psubs_i8.ispc b/tests/psubs_i8.ispc index fbc24d25..e04867bd 100644 --- a/tests/psubs_i8.ispc +++ b/tests/psubs_i8.ispc @@ -2,10 +2,26 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int8 a = -128; // min signed int8 - RET[programIndex] = saturating_sub(a, b); + uniform int8 a_max = 127, a_min = -128; // max and min signed int8 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_sub(a_max, -b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - 
RET[programIndex] = (uniform int8) -128; + if (programIndex % 3 == 0) { + RET[programIndex] = (uniform int8) -128; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (uniform int8) 127; + } + else { + RET[programIndex] = (uniform int8) 122; + } } diff --git a/tests/psubs_vi16-2.ispc b/tests/psubs_vi16-2.ispc deleted file mode 100644 index ef1b2ef4..00000000 --- a/tests/psubs_vi16-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = 32767, b = aFOO[programIndex]; // min unsigned int16 - RET[programIndex] = saturating_sub(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (varying int16) 32767; -} diff --git a/tests/psubs_vi16.ispc b/tests/psubs_vi16.ispc index e405a23f..df130115 100644 --- a/tests/psubs_vi16.ispc +++ b/tests/psubs_vi16.ispc @@ -1,11 +1,27 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = -32768, b = aFOO[programIndex]; // min unsigned int16 - RET[programIndex] = saturating_sub(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying int16 a_max = 32767, a_min = -32768; // max and min signed int16 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_sub(a_max, -b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying int16) -32768; + if (programIndex % 3 == 0) { + RET[programIndex] = (varying int16) -32768; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (varying int16) 32767; + } + else { + RET[programIndex] = (varying int16) 32762; + } } diff --git a/tests/psubs_vi8-2.ispc b/tests/psubs_vi8-2.ispc deleted file mode 100644 index b7fb02c6..00000000 --- a/tests/psubs_vi8-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = 127, b = aFOO[programIndex]; // min unsigned int8 - RET[programIndex] = saturating_sub(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (varying int8) 127; -} diff --git a/tests/psubs_vi8.ispc b/tests/psubs_vi8.ispc index 7d852f0a..d7e9ff89 100644 --- a/tests/psubs_vi8.ispc +++ b/tests/psubs_vi8.ispc @@ -1,11 +1,27 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = -128, b = aFOO[programIndex]; // min unsigned int8 - RET[programIndex] = saturating_sub(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying int8 a_max = 127, a_min = -128; // max and min signed int8 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_sub(a_max, -b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying int8) -128; + if (programIndex % 3 == 0) { + RET[programIndex] = (varying int8) -128; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (varying int8) 127; + } + else { + RET[programIndex] = (varying int8) 122; + } } diff --git a/tests/psubus_i16.ispc b/tests/psubus_i16.ispc index a7f60603..f9ae3568 100644 --- 
a/tests/psubus_i16.ispc +++ b/tests/psubus_i16.ispc @@ -2,10 +2,20 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform unsigned int8 a = 0; // min unsigned int16 - RET[programIndex] = saturating_sub(a, b); + uniform unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform unsigned int8) 0; + if (programIndex % 2 == 0) { + RET[programIndex] = (uniform unsigned int16) 0; + } + else { + RET[programIndex] = (uniform unsigned int16) 65530; + } } diff --git a/tests/psubus_i8.ispc b/tests/psubus_i8.ispc index 7cb7ecdc..e6f30b2a 100644 --- a/tests/psubus_i8.ispc +++ b/tests/psubus_i8.ispc @@ -2,10 +2,20 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform unsigned int8 a = 0; // min unsigned int8 - RET[programIndex] = saturating_sub(a, b); + uniform unsigned int8 a_max = 255, a_min = 0; // max and min unsigned int8 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform unsigned int8) 0; + if (programIndex % 2 == 0) { + RET[programIndex] = (uniform unsigned int8) 0; + } + else { + RET[programIndex] = (uniform unsigned int8) 250; + } } diff --git a/tests/psubus_vi16.ispc b/tests/psubus_vi16.ispc index e441b699..0974cc5e 100644 --- a/tests/psubus_vi16.ispc +++ b/tests/psubus_vi16.ispc @@ -1,11 +1,21 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int16 a = 0, b = aFOO[programIndex]; // min unsigned int16 - RET[programIndex] = saturating_sub(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying unsigned int16) 0; + if (programIndex % 2 == 0) { + RET[programIndex] = (varying unsigned int16) 0; + } + else { + RET[programIndex] = (varying unsigned int16) 65530; + } } diff --git a/tests/psubus_vi8.ispc b/tests/psubus_vi8.ispc index 7ba5f14a..f7ad65d3 100644 --- a/tests/psubus_vi8.ispc +++ b/tests/psubus_vi8.ispc @@ -1,11 +1,21 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int8 a = 0, b = aFOO[programIndex]; // min unsigned int8 - RET[programIndex] = saturating_sub(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying unsigned int8 a_max = 255, a_min = 0; // max and min signed int8 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying unsigned int8) 0; + if (programIndex % 2 == 0) { + RET[programIndex] = (varying unsigned int8) 0; + } + else { + RET[programIndex] = (varying unsigned int8) 250; + } } From 
da02236b3ac1d1a663949e81900ab8dbe71111a4 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Mon, 20 Jan 2014 16:06:34 +0400 Subject: [PATCH 13/16] Scalar realization of no-vec functions was replaced from builtins to stdlib.ispc. --- builtins.cpp | 8 --- builtins/target-avx-common.ll | 1 - builtins/target-generic-common.ll | 1 - builtins/target-sse2-common.ll | 1 - builtins/target-sse4-common.ll | 1 - builtins/util.m4 | 98 ++++++++++--------------------- stdlib.ispc | 72 ++++++++++++++++++++--- 7 files changed, 94 insertions(+), 88 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index c6828a00..b693ad3a 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -488,12 +488,8 @@ lSetInternalFunctions(llvm::Module *module) { "__num_cores", "__packed_load_active", "__packed_store_active", - "__padds_i8", - "__padds_i16", "__padds_vi8", "__padds_vi16", - "__paddus_i8", - "__paddus_i16", "__paddus_vi8", "__paddus_vi16", "__popcnt_int32", @@ -502,12 +498,8 @@ lSetInternalFunctions(llvm::Module *module) { "__prefetch_read_uniform_2", "__prefetch_read_uniform_3", "__prefetch_read_uniform_nt", - "__psubs_i8", - "__psubs_i16", "__psubs_vi8", "__psubs_vi16", - "__psubus_i8", - "__psubus_i16", "__psubus_vi8", "__psubus_vi16", "__rcp_uniform_float", diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index d6b577b8..dcca74f0 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -40,7 +40,6 @@ ctlztz() define_prefetches() define_shuffles() aossoa() -saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 6f5199d8..92b7a18e 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -41,7 +41,6 @@ stdlib_core() scans() reduce_equal(WIDTH) rdrand_decls() -saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; broadcast/rotate/shuffle diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index d8a461aa..ad1d88bc 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -34,7 +34,6 @@ define_prefetches() define_shuffles() aossoa() rdrand_decls() -saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 2dd5c149..50dd0582 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -37,7 +37,6 @@ define_prefetches() define_shuffles() aossoa() rdrand_decls() -saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/util.m4 b/builtins/util.m4 index de48a0a1..4bdc501b 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -184,73 +184,7 @@ define(`convert32to16', ` define(`saturation_arithmetic', `ifelse(WIDTH, `4', `saturation_arithmetic_vec4()', WIDTH, `8', `saturation_arithmetic_vec8()', - WIDTH, `16', `saturation_arithmetic_vec16()', - `saturation_arithmetic_uniform()')') - -;; utility function used by saturation_arithmetic_uniform below. This shouldn't be called by -;; target .ll files directly. 
-;; $1: {add,sub} (used in constructing function names) - -define(`saturation_arithmetic_uniform_universal', ` -declare <16 x i8> @llvm.x86.sse2.p$1s.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__p$1s_i8(i8 %a0, i8 %a1) { - %a0_i16 = sext i8 %a0 to i16 - %a1_i16 = sext i8 %a1 to i16 - %res = $1 i16 %a0_i16, %a1_i16 - %over_mask = icmp sgt i16 %res, 127 - %over_res = select i1 %over_mask, i16 127, i16 %res - %under_mask = icmp slt i16 %res, -128 - %ret_i16 = select i1 %under_mask, i16 -128, i16 %over_res - %ret = trunc i16 %ret_i16 to i8 - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.p$1s.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__p$1s_i16(i16 %a0, i16 %a1) { - %a0_i32 = sext i16 %a0 to i32 - %a1_i32 = sext i16 %a1 to i32 - %res = $1 i32 %a0_i32, %a1_i32 - %over_mask = icmp sgt i32 %res, 32767 - %over_res = select i1 %over_mask, i32 32767, i32 %res - %under_mask = icmp slt i32 %res, -32768 - %ret_i32 = select i1 %under_mask, i32 -32768, i32 %over_res - %ret = trunc i32 %ret_i32 to i16 - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.p$1us.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__p$1us_i8(i8 %a0, i8 %a1) { - %a0_i16 = zext i8 %a0 to i16 - %a1_i16 = zext i8 %a1 to i16 - %res = $1 i16 %a0_i16, %a1_i16 - %over_mask = icmp ugt i16 %res, 255 - %over_res = select i1 %over_mask, i16 255, i16 %res - %under_mask = icmp slt i16 %res, 0 - %ret_i16 = select i1 %under_mask, i16 0, i16 %over_res - %ret = trunc i16 %ret_i16 to i8 - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.p$1us.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__p$1us_i16(i16 %a0, i16 %a1) { - %a0_i32 = zext i16 %a0 to i32 - %a1_i32 = zext i16 %a1 to i32 - %res = $1 i32 %a0_i32, %a1_i32 - %over_mask = icmp ugt i32 %res, 65535 - %over_res = select i1 %over_mask, i32 65535, i32 %res - %under_mask = icmp slt i32 %res, 0 - %ret_i32 = select i1 %under_mask, i32 0, i32 %over_res - %ret = trunc i32 %ret_i32 to i16 - ret i16 %ret -} -') - -;;uniform saturation arithmetic - -define(`saturation_arithmetic_uniform', ` -saturation_arithmetic_uniform_universal(sub) -saturation_arithmetic_uniform_universal(add) -') + WIDTH, `16', `saturation_arithmetic_vec16()')') ;; create vector constant. Used by saturation_arithmetic_novec_universal below. 
@@ -278,6 +212,7 @@ ifelse(WIDTH, `4', `<$1 $2, $1 $2, $1 $2, $1 $2>', ;; $1: {add,sub} (used in constructing function names) define(`saturation_arithmetic_novec_universal', ` +declare <16 x i8> @llvm.x86.sse2.p$1s.b(<16 x i8>, <16 x i8>) nounwind readnone define @__p$1s_vi8(, ) { %v0_i16 = sext %0 to %v1_i16 = sext %1 to @@ -290,6 +225,7 @@ define @__p$1s_vi8(, ) { ret %ret } +declare <8 x i16> @llvm.x86.sse2.p$1s.w(<8 x i16>, <8 x i16>) nounwind readnone define @__p$1s_vi16(, ) { %v0_i32 = sext %0 to %v1_i32 = sext %1 to @@ -302,6 +238,7 @@ define @__p$1s_vi16(, ) { ret %ret } +declare <16 x i8> @llvm.x86.sse2.p$1us.b(<16 x i8>, <16 x i8>) nounwind readnone define @__p$1us_vi8(, ) { %v0_i16 = zext %0 to %v1_i16 = zext %1 to @@ -313,7 +250,8 @@ define @__p$1us_vi8(, ) { %ret = trunc %ret_i16 to ret %ret } - + +declare <8 x i16> @llvm.x86.sse2.p$1us.w(<8 x i16>, <8 x i16>) nounwind readnone define @__p$1us_vi16(, ) { %v0_i32 = zext %0 to %v1_i32 = zext %1 to @@ -337,6 +275,7 @@ saturation_arithmetic_novec_universal(add) ;;4-wide vector saturation arithmetic define(`saturation_arithmetic_vec4', ` +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) @@ -345,6 +284,7 @@ define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) { ret <4 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) @@ -353,6 +293,7 @@ define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) { ret <4 x i16> %r } +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) @@ -361,6 +302,7 @@ define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) { ret <4 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <4 x i16> @__paddus_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) @@ -369,6 +311,7 @@ define <4 x i16> @__paddus_vi16(<4 x i16>, <4 x i16>) { ret <4 x i16> %r } +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) @@ -377,6 +320,7 @@ define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) { ret <4 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) @@ -385,6 +329,7 @@ define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) { ret <4 x i16> %r } +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) @@ -393,6 +338,7 @@ define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) { ret <4 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) @@ -405,6 +351,7 @@ define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) { ;;8-wide vector saturation arithmetic define(`saturation_arithmetic_vec8', ` +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) { 
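 ; widen the 8-wide operands to the <16 x i8> type the SSE2 intrinsic expects,
 ; call it, then narrow the result back; the same convert*/call/convert pattern
 ; is used by the other narrow __p*_vi8/__p*_vi16 wrappers in the vec4/vec8 macros.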
convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) @@ -413,11 +360,13 @@ define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) { ret <8 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ret <8 x i16> %res } +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) @@ -426,11 +375,13 @@ define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) { ret <8 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ret <8 x i16> %res } +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) @@ -439,11 +390,13 @@ define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) { ret <8 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ret <8 x i16> %res } +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) @@ -452,6 +405,7 @@ define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) { ret <8 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ret <8 x i16> %res @@ -461,41 +415,49 @@ define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { ;;16-wide vector saturation arithmetic define(`saturation_arithmetic_vec16', ` +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) ret <16 x i16> %ret } +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) ret <16 x i16> %ret } +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, 
%a0, %a1) ret <16 x i16> %ret } +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) ret <16 x i16> %ret diff --git a/stdlib.ispc b/stdlib.ispc index 487b4184..f977abf8 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -57,6 +57,34 @@ #error Unknown value of ISPC_MASK_BITS #endif +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32768) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483648) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295) +#endif /////////////////////////////////////////////////////////////////////////// // Low level primitives @@ -4261,7 +4289,12 @@ static inline void fastmath() { // saturation arithmetic static inline uniform int8 saturating_add(uniform int8 a, uniform int8 b) { - return __padds_i8(a, b); + uniform unsigned int8 a_unsig = a, b_unsig = b; + uniform unsigned int8 result = a_unsig + b_unsig; + a_unsig = (a_unsig >> 7) + INT8_MAX; + if ((uniform int8) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0) + result = a_unsig; + return result; } static inline varying int8 saturating_add(varying int8 a, varying int8 b) { @@ -4269,7 +4302,12 @@ static inline varying int8 saturating_add(varying int8 a, varying int8 b) { } static inline uniform int16 saturating_add(uniform int16 a, uniform int16 b) { - return __padds_i16(a, b); + uniform unsigned int16 a_unsig = a, b_unsig = b; + uniform unsigned int16 result = a_unsig + b_unsig; + a_unsig = (a_unsig >> 15) + INT16_MAX; + if ((uniform int16) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0) + result = a_unsig; + return result; } static inline varying int16 saturating_add(varying int16 a, varying int16 b) { @@ -4278,7 +4316,9 @@ static inline varying int16 saturating_add(varying int16 a, varying int16 b) { static inline uniform unsigned int8 saturating_add(uniform unsigned int8 a, uniform unsigned int8 b) { - return __paddus_i8(a, b); + uniform unsigned int8 result = a + b; + result |= (-(uniform int8)(result < a)); + return result; } static inline varying unsigned int8 saturating_add(varying unsigned int8 a, @@ -4288,7 +4328,9 @@ static inline varying unsigned int8 saturating_add(varying unsigned int8 a, static inline uniform unsigned int16 saturating_add(uniform unsigned int16 a, uniform unsigned int16 b) { - return __paddus_i16(a, b); + uniform unsigned int16 result = a + b; + result |= (-(uniform int16)(result < a)); + return result; } static inline varying unsigned int16 saturating_add(varying unsigned int16 a, @@ -4297,7 +4339,12 @@ static inline varying unsigned int16 saturating_add(varying unsigned int16 a, } static inline uniform int8 saturating_sub(uniform int8 a, uniform int8 b) { - return __psubs_i8(a, b); + uniform unsigned int8 a_unsig = a, b_unsig = b; + uniform unsigned int8 result = a_unsig - b_unsig; + a_unsig = (a_unsig >> 7) + INT8_MAX; 
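+    // a_unsig now holds the saturation value: INT8_MAX when a >= 0, or the
+    // INT8_MIN bit pattern (0x80) when a < 0, so its sign bit still matches a's.
+    // The test below is the standard signed-overflow check for a - b: overflow
+    // occurs iff a and b have different signs and the result's sign differs from a's.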
+ if ((uniform int8) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0) + result = a_unsig; + return result; } static inline varying int8 saturating_sub(varying int8 a, varying int8 b) { @@ -4305,7 +4352,12 @@ static inline varying int8 saturating_sub(varying int8 a, varying int8 b) { } static inline uniform int16 saturating_sub(uniform int16 a, uniform int16 b) { - return __psubs_i16(a, b); + uniform unsigned int16 a_unsig = a, b_unsig = b; + uniform unsigned int16 result = a_unsig - b_unsig; + a_unsig = (a_unsig >> 15) + INT16_MAX; + if ((uniform int16) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0) + result = a_unsig; + return result; } static inline varying int16 saturating_sub(varying int16 a, varying int16 b) { @@ -4314,7 +4366,9 @@ static inline varying int16 saturating_sub(varying int16 a, varying int16 b) { static inline uniform unsigned int8 saturating_sub(uniform unsigned int8 a, uniform unsigned int8 b) { - return __psubus_i8(a, b); + uniform unsigned int8 result = a - b; + result &= (-(uniform int8)(result <= a)); + return result; } static inline varying unsigned int8 saturating_sub(varying unsigned int8 a, @@ -4324,7 +4378,9 @@ static inline varying unsigned int8 saturating_sub(varying unsigned int8 a, static inline uniform unsigned int16 saturating_sub(uniform unsigned int16 a, uniform unsigned int16 b) { - return __psubus_i16(a, b); + uniform unsigned int16 result = a - b; + result &= (-(uniform int16)(result <= a)); + return result; } static inline varying unsigned int16 saturating_sub(varying unsigned int16 a, From 1c1614d20755d441074b4084ee41d76e74464b2a Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Sun, 9 Feb 2014 21:39:42 +0400 Subject: [PATCH 14/16] Some errors in comments and code were fixed --- builtins/util.m4 | 17 +++-------------- stdlib.ispc | 27 ++++++++++++++++++--------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/builtins/util.m4 b/builtins/util.m4 index 4bdc501b..a991ae09 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -50,11 +50,11 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector convertation utilities -;; convert 1-wide vector into 8-wide vector +;; convert vector of one width into vector of other width ;; ;; $1: vector element type -;; $2: 1-wide vector -;; $3: 8-wide vector +;; $2: vector of the first width +;; $3: vector of the second width define(`convert1to8', ` @@ -129,13 +129,6 @@ define(`convert16to32', ` i32 undef, i32 undef, i32 undef, i32 undef> ') -;; convert 4-wide vector into 8-wide vector -;; -;; $1: vector element type -;; $2: 8-wide vector -;; $3: 1-wide vector - - define(`convert8to1', ` $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <1 x i32> @@ -212,7 +205,6 @@ ifelse(WIDTH, `4', `<$1 $2, $1 $2, $1 $2, $1 $2>', ;; $1: {add,sub} (used in constructing function names) define(`saturation_arithmetic_novec_universal', ` -declare <16 x i8> @llvm.x86.sse2.p$1s.b(<16 x i8>, <16 x i8>) nounwind readnone define @__p$1s_vi8(, ) { %v0_i16 = sext %0 to %v1_i16 = sext %1 to @@ -225,7 +217,6 @@ define @__p$1s_vi8(, ) { ret %ret } -declare <8 x i16> @llvm.x86.sse2.p$1s.w(<8 x i16>, <8 x i16>) nounwind readnone define @__p$1s_vi16(, ) { %v0_i32 = sext %0 to %v1_i32 = sext %1 to @@ -238,7 +229,6 @@ define @__p$1s_vi16(, ) { ret %ret } -declare <16 x i8> @llvm.x86.sse2.p$1us.b(<16 x i8>, <16 x i8>) nounwind readnone define @__p$1us_vi8(, ) { %v0_i16 = zext %0 to %v1_i16 = zext %1 to @@ -251,7 +241,6 @@ define @__p$1us_vi8(, ) { ret %ret } -declare <8 x 
i16> @llvm.x86.sse2.p$1us.w(<8 x i16>, <8 x i16>) nounwind readnone define @__p$1us_vi16(, ) { %v0_i32 = zext %0 to %v1_i32 = zext %1 to diff --git a/stdlib.ispc b/stdlib.ispc index f977abf8..9bb13f4e 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -58,15 +58,6 @@ #endif /* Limits of integral types. */ -#ifndef INT8_MIN -#define INT8_MIN (-128) -#endif -#ifndef INT16_MIN -#define INT16_MIN (-32768) -#endif -#ifndef INT32_MIN -#define INT32_MIN (-2147483648) -#endif #ifndef INT8_MAX #define INT8_MAX (127) #endif @@ -76,6 +67,9 @@ #ifndef INT32_MAX #define INT32_MAX (2147483647) #endif +#ifndef INT64_MAX +#define INT64_MAX (9223372036854775807) +#endif #ifndef UINT8_MAX #define UINT8_MAX (255) #endif @@ -85,6 +79,21 @@ #ifndef UINT32_MAX #define UINT32_MAX (4294967295) #endif +#ifndef UINT64_MAX +#define UINT64_MAX (18446744073709551615) +#endif +#ifndef INT8_MIN +#define INT8_MIN (-INT8_MAX - 1) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-INT16_MAX - 1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-INT32_MAX - 1) +#endif +#ifndef INT64_MIN +#define INT64_MIN (-INT64_MAX - 1) +#endif /////////////////////////////////////////////////////////////////////////// // Low level primitives From 65d947e44905d10bf1a2edde5a5c39bd7533c987 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Mon, 10 Feb 2014 15:18:48 +0400 Subject: [PATCH 15/16] Else branch with error report was added --- builtins/util.m4 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/builtins/util.m4 b/builtins/util.m4 index 86051436..df2adab2 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -177,7 +177,8 @@ define(`convert32to16', ` define(`saturation_arithmetic', `ifelse(WIDTH, `4', `saturation_arithmetic_vec4()', WIDTH, `8', `saturation_arithmetic_vec8()', - WIDTH, `16', `saturation_arithmetic_vec16()')') + WIDTH, `16', `saturation_arithmetic_vec16() ', + `ERROR_unappropriate_width')') ;; create vector constant. Used by saturation_arithmetic_novec_universal below. From ea0a514e03eb6389099feb1b918d3c2378221a44 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 11 Feb 2014 15:33:23 +0400 Subject: [PATCH 16/16] Fix for generic-1 --- builtins/target-generic-1.ll | 1 - builtins/util.m4 | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 94ffe87e..a3de92f3 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -10,7 +10,6 @@ packed_load_and_store() scans() int64minmax() aossoa() -saturation_arithmetic() saturation_arithmetic_novec() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/util.m4 b/builtins/util.m4 index df2adab2..025018e9 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -178,7 +178,10 @@ define(`saturation_arithmetic', `ifelse(WIDTH, `4', `saturation_arithmetic_vec4()', WIDTH, `8', `saturation_arithmetic_vec8()', WIDTH, `16', `saturation_arithmetic_vec16() ', - `ERROR_unappropriate_width')') + `errprint(`ERROR: saturation_arithmetic() macro called with unsupported width = 'WIDTH +) + m4exit(`1')') +') ;; create vector constant. Used by saturation_arithmetic_novec_universal below.
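For quick reference, a minimal ISPC usage sketch of the saturating_add/saturating_sub stdlib overloads introduced by this series. It is illustrative only and not part of the patches; the export name and array parameters are made up for the example.

// Saturating arithmetic clamps at the type's limits instead of wrapping.
export void saturate_demo(uniform int8 s8[], uniform unsigned int8 u8[]) {
    uniform int8 smax = 127, one = 1;        // INT8_MAX
    uniform unsigned int8 uzero = 0, uone = 1;
    foreach (i = 0 ... programCount) {
        s8[i] = saturating_add(smax, one);   // stays 127; a wrapping add gives -128
        u8[i] = saturating_sub(uzero, uone); // stays 0; a wrapping sub gives 255
    }
}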