Merge pull request #673 from Vsevolod-Livinskij/master

Saturation arithmetic.
2014-02-11 16:40:40 +03:00
parent 2570385770 a3c77e6dc6
commit e8039cd822
36 changed files with 961 additions and 3 deletions
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -489,12 +489,20 @@ lSetInternalFunctions(llvm::Module *module) {
        "__packed_load_active",
        "__packed_store_active",
        "__packed_store_active2",
+        "__padds_vi8",
+        "__padds_vi16",
+        "__paddus_vi8",
+        "__paddus_vi16",
        "__popcnt_int32",
        "__popcnt_int64",
        "__prefetch_read_uniform_1",
        "__prefetch_read_uniform_2",
        "__prefetch_read_uniform_3",
        "__prefetch_read_uniform_nt",
+        "__psubs_vi8",
+        "__psubs_vi16",
+        "__psubus_vi8",
+        "__psubus_vi16",
        "__rcp_uniform_float",
        "__rcp_varying_float",
        "__rcp_uniform_double",
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -40,6 +40,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()

 include(`target-avx-common.ll')

--- a/builtins/target-avx1-i64x4base.ll
+++ b/builtins/target-avx1-i64x4base.ll
@@ -40,6 +40,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()

 include(`target-avx-common.ll')

--- a/builtins/target-avx1.ll
+++ b/builtins/target-avx1.ll
@@ -32,6 +32,7 @@
 include(`target-avx.ll')

 rdrand_decls()
+saturation_arithmetic()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max
--- a/builtins/target-avx11.ll
+++ b/builtins/target-avx11.ll
@@ -35,6 +35,8 @@ ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
       `rdrand_definition()')

+saturation_arithmetic()
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max

--- a/builtins/target-avx2.ll
+++ b/builtins/target-avx2.ll
@@ -39,6 +39,8 @@ ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
       `rdrand_definition()')

+saturation_arithmetic()
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max

--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -10,6 +10,7 @@ packed_load_and_store()
 scans()
 int64minmax()
 aossoa()
+saturation_arithmetic_novec()

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
--- a/builtins/target-generic-16.ll
+++ b/builtins/target-generic-16.ll
@@ -31,4 +31,4 @@

 define(`WIDTH',`16')
 include(`target-generic-common.ll')
-
+saturation_arithmetic_novec()
--- a/builtins/target-generic-32.ll
+++ b/builtins/target-generic-32.ll
@@ -31,3 +31,4 @@

 define(`WIDTH',`32')
 include(`target-generic-common.ll')
+saturation_arithmetic_novec()
--- a/builtins/target-generic-4.ll
+++ b/builtins/target-generic-4.ll
@@ -31,4 +31,4 @@

 define(`WIDTH',`4')
 include(`target-generic-common.ll')
-
+saturation_arithmetic_novec()
--- a/builtins/target-generic-64.ll
+++ b/builtins/target-generic-64.ll
@@ -31,3 +31,4 @@

 define(`WIDTH',`64')
 include(`target-generic-common.ll')
+saturation_arithmetic_novec()
--- a/builtins/target-generic-8.ll
+++ b/builtins/target-generic-8.ll
@@ -31,4 +31,4 @@

 define(`WIDTH',`8')
 include(`target-generic-common.ll')
-
+saturation_arithmetic_novec()
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -44,6 +44,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()

 include(`target-sse2-common.ll')

--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -41,6 +41,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()

 include(`target-sse2-common.ll')

--- a/builtins/target-sse4-16.ll
+++ b/builtins/target-sse4-16.ll
@@ -41,6 +41,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()

 include(`target-sse4-common.ll')

--- a/builtins/target-sse4-8.ll
+++ b/builtins/target-sse4-8.ll
@@ -41,6 +41,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()

 include(`target-sse4-common.ll')

--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -44,6 +44,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()

 include(`target-sse4-common.ll')

--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -41,6 +41,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()

 include(`target-sse4-common.ll')

--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -49,6 +49,416 @@ define(`MASK_HIGH_BIT_ON',

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

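+;; vector conversion utilities
+;; convert a vector of one width into a vector of another width
+;; (widening pads the extra lanes with undef, narrowing keeps the low lanes)
+;;
+;; $1: vector element type
+;; $2: source vector (of the first width)
+;; $3: name of the resulting vector (of the second width)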
+
+
+;; Widen the 1-element vector $2 to 8 elements in $3; lanes 1..7 are undef.
+define(`convert1to8', `
+  $3 = shufflevector <1 x $1> $2, <1 x $1> undef,
+  <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, 
+             i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+
+;; Widen the 1-element vector $2 to 16 elements in $3; lanes 1..15 are undef.
+define(`convert1to16', `
+  $3 = shufflevector <1 x $1> $2, <1 x $1> undef,
+  <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, 
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+;; Widen the 4-element vector $2 to 8 elements in $3; lanes 4..7 are undef.
+define(`convert4to8', `
+  $3 = shufflevector <4 x $1> $2, <4 x $1> undef,
+  <8 x i32> <i32 0, i32 1, i32 2, i32 3, 
+             i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+;; Widen the 4-element vector $2 to 16 elements in $3; lanes 4..15 are undef.
+define(`convert4to16', `
+  $3 = shufflevector <4 x $1> $2, <4 x $1> undef,
+  <16 x i32> <i32 0, i32 1, i32 2, i32 3, 
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+;; Widen the 8-element vector $2 to 16 elements in $3; lanes 8..15 are undef.
+define(`convert8to16', `
+  $3 = shufflevector <8 x $1> $2, <8 x $1> undef,
+  <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+;; Widen the 4-element vector $2 to 32 elements in $3; lanes 4..31 are undef.
+define(`convert4to32', `
+  $3 = shufflevector <4 x $1> $2, <4 x $1> undef,
+  <32 x i32> <i32 0, i32 1, i32 2, i32 3, 
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+;; Widen the 8-element vector $2 to 32 elements in $3; lanes 8..31 are undef.
+;; Both shufflevector operands must be typed as 8-wide vectors, since the
+;; shuffle indices 0..7 select from the 8-wide source.
+define(`convert8to32', `
+  $3 = shufflevector <8 x $1> $2, <8 x $1> undef,
+  <32 x i32> <i32 0, i32 1, i32 2, i32 3, 
+              i32 4, i32 5, i32 6, i32 7,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+;; Widen the 16-element vector $2 to 32 elements in $3; lanes 16..31 are undef.
+;; Both shufflevector operands must be typed as 16-wide vectors, since the
+;; shuffle indices 0..15 select from the 16-wide source.
+define(`convert16to32', `
+  $3 = shufflevector <16 x $1> $2, <16 x $1> undef,
+  <32 x i32> <i32  0, i32 1,  i32  2, i32  3, 
+              i32  4, i32 5,  i32  6, i32  7,
+              i32  8, i32 9,  i32 10, i32 11,
+              i32 12, i32 13, i32 14, i32 15,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+;; Narrow the 8-element vector $2 to its lane 0, as a 1-element vector $3.
+define(`convert8to1', `
+  $3 = shufflevector <8 x $1> $2, <8 x $1> undef,
+    <1 x i32> <i32 0>
+')
+
+
+;; Narrow the 16-element vector $2 to its lane 0, as a 1-element vector $3.
+define(`convert16to1', `
+  $3 = shufflevector <16 x $1> $2, <16 x $1> undef,
+    <1 x i32> <i32 0>
+')
+
+;; Narrow the 8-element vector $2 to its low 4 lanes in $3.
+define(`convert8to4', `
+  $3 = shufflevector <8 x $1> $2, <8 x $1> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+')
+
+
+;; Narrow the 16-element vector $2 to its low 4 lanes in $3.
+define(`convert16to4', `
+  $3 = shufflevector <16 x $1> $2, <16 x $1> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+')
+
+;; Narrow the 16-element vector $2 to its low 8 lanes in $3.
+define(`convert16to8', `
+  $3 = shufflevector <16 x $1> $2, <16 x $1> undef,
+  <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+')
+
+;; Narrow the 32-element vector $2 to its low 4 lanes in $3.
+define(`convert32to4', `
+  $3 = shufflevector <32 x $1> $2, <32 x $1> undef,
+    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+')
+
+;; Narrow the 32-element vector $2 to its low 8 lanes in $3.
+;; The shuffle index vector must list exactly 8 entries (one per result lane).
+define(`convert32to8', `
+  $3 = shufflevector <32 x $1> $2, <32 x $1> undef,
+    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+')
+
+;; Narrow the 32-element vector $2 to its low 16 lanes in $3.
+;; The shuffle index vector must list exactly 16 entries (one per result lane).
+define(`convert32to16', `
+  $3 = shufflevector <32 x $1> $2, <32 x $1> undef,
+    <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;saturation arithmetic
+ 
+;; Emit the SSE2 intrinsic based saturating add/sub functions for the target
+;; vector width. Only widths 4, 8 and 16 are handled; any other width prints
+;; an error message and stops macro processing with exit status 1.
+define(`saturation_arithmetic',
+`ifelse(WIDTH,  `4', `saturation_arithmetic_vec4()', 
+        WIDTH,  `8', `saturation_arithmetic_vec8()',
+        WIDTH, `16', `saturation_arithmetic_vec16() ',
+                     `errprint(`ERROR: saturation_arithmetic() macro called with unsupported width = 'WIDTH
+)
+                      m4exit(`1')')
+')
+
+;; create vector constant. Used by saturation_arithmetic_novec_universal below.
+
+;; $1: element type (e.g. i16)
+;; $2: element value
+;; Expands to a vector literal that repeats "$1 $2" once per lane for target
+;; widths 4, 8, 16, 32 and 64; any other width yields a 1-element literal.
+define(`const_vector', `
+ifelse(WIDTH,  `4', `<$1 $2, $1 $2, $1 $2, $1 $2>', 
+       WIDTH,  `8', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
+       WIDTH, `16', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
+       WIDTH, `32', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
+       WIDTH, `64', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
+                        `<$1 $2>')')
+                        
+;; utility function used by saturation_arithmetic_novec below.  This shouldn't be called by
+;; target .ll files directly.
+;; $1: {add,sub} (used in constructing function names)
+                        
+;; Emits four functions (signed and unsigned, for i8 and i16 elements). Each
+;; one widens both operands to twice the element width (sext for the signed
+;; variants, zext for the unsigned ones), applies the operation, clamps the
+;; wide result to the range of the narrow type and truncates it back.
+define(`saturation_arithmetic_novec_universal', `
+define <WIDTH x i8> @__p$1s_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0_i16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
+  %v1_i16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
+  %res = $1 <WIDTH x i16> %v0_i16, %v1_i16
+  %over_mask = icmp sgt <WIDTH x i16> %res, const_vector(i16, 127)
+  %over_res = select <WIDTH x i1> %over_mask, <WIDTH x i16> const_vector(i16, 127), <WIDTH x i16> %res
+  %under_mask = icmp slt <WIDTH x i16> %res, const_vector(i16, -128)
+  %ret_i16 = select <WIDTH x i1> %under_mask, <WIDTH x i16> const_vector(i16, -128), <WIDTH x i16> %over_res
+  %ret = trunc <WIDTH x i16> %ret_i16 to <WIDTH x i8>
+  ret <WIDTH x i8> %ret
+}
+
+define <WIDTH x i16> @__p$1s_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0_i32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
+  %v1_i32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
+  %res = $1 <WIDTH x i32> %v0_i32, %v1_i32
+  %over_mask = icmp sgt <WIDTH x i32> %res, const_vector(i32, 32767)
+  %over_res = select <WIDTH x i1> %over_mask, <WIDTH x i32> const_vector(i32, 32767), <WIDTH x i32> %res
+  %under_mask = icmp slt <WIDTH x i32> %res, const_vector(i32, -32768)
+  %ret_i32 = select <WIDTH x i1> %under_mask, <WIDTH x i32> const_vector(i32, -32768), <WIDTH x i32> %over_res
+  %ret = trunc <WIDTH x i32> %ret_i32 to <WIDTH x i16>
+  ret <WIDTH x i16> %ret
+}
+
+define <WIDTH x i8> @__p$1us_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0_i16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
+  %v1_i16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
+  %res = $1 <WIDTH x i16> %v0_i16, %v1_i16
+  %over_mask = icmp ugt <WIDTH x i16> %res, const_vector(i16, 255)
+  %over_res = select <WIDTH x i1> %over_mask, <WIDTH x i16> const_vector(i16, 255), <WIDTH x i16> %res
+  %under_mask = icmp slt <WIDTH x i16> %res, const_vector(i16, 0)
+  %ret_i16 = select <WIDTH x i1> %under_mask, <WIDTH x i16> const_vector(i16, 0), <WIDTH x i16> %over_res
+  %ret = trunc <WIDTH x i16> %ret_i16 to <WIDTH x i8>
+  ret <WIDTH x i8> %ret
+}
+
+define <WIDTH x i16> @__p$1us_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0_i32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
+  %v1_i32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
+  %res = $1 <WIDTH x i32> %v0_i32, %v1_i32
+  %over_mask = icmp ugt <WIDTH x i32> %res, const_vector(i32, 65535)
+  %over_res = select <WIDTH x i1> %over_mask, <WIDTH x i32> const_vector(i32, 65535), <WIDTH x i32> %res
+  %under_mask = icmp slt <WIDTH x i32> %res, const_vector(i32, 0)
+  %ret_i32 = select <WIDTH x i1> %under_mask, <WIDTH x i32> const_vector(i32, 0), <WIDTH x i32> %over_res
+  %ret = trunc <WIDTH x i32> %ret_i32 to <WIDTH x i16>
+  ret <WIDTH x i16> %ret
+}
+')
+
+;; implementation for targets that do not have h/w saturation instructions:
+;; instantiates the generic (widen, clamp, truncate) code for sub and for add
+
+define(`saturation_arithmetic_novec', `
+saturation_arithmetic_novec_universal(sub)
+saturation_arithmetic_novec_universal(add)
+')
+
+;;4-wide vector saturation arithmetic
+
+;; Each 4-wide operand is padded with undef lanes to a full SSE2 vector
+;; (16 x i8 or 8 x i16), the SSE2 saturating intrinsic is applied, and the
+;; low 4 lanes of the intrinsic result are returned.
+define(`saturation_arithmetic_vec4', `
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) {
+  convert4to16(i8, %0, %v0)
+  convert4to16(i8, %1, %v1)    
+  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
+  convert16to4(i8, %r16, %r)
+  ret <4 x i8> %r
+}
+
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) {
+  convert4to8(i16, %0, %v0)
+  convert4to8(i16, %1, %v1)
+  %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1)
+  convert8to4(i16, %r16, %r)
+  ret <4 x i16> %r
+}
+
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) {
+  convert4to16(i8, %0, %v0)
+  convert4to16(i8, %1, %v1)
+  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
+  convert16to4(i8, %r16, %r)
+  ret <4 x i8> %r
+}
+
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <4 x i16> @__paddus_vi16(<4 x i16>, <4 x i16>) {
+  convert4to8(i16, %0, %v0)
+  convert4to8(i16, %1, %v1)
+  %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1)
+  convert8to4(i16, %r16, %r)  
+  ret <4 x i16> %r
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) {
+  convert4to16(i8, %0, %v0)
+  convert4to16(i8, %1, %v1)
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
+  convert16to4(i8, %r16, %r)
+  ret <4 x i8> %r
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) {
+  convert4to8(i16, %0, %v0)
+  convert4to8(i16, %1, %v1)
+  %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1)
+  convert8to4(i16, %r16, %r)
+  ret <4 x i16> %r
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) {
+  convert4to16(i8, %0, %v0)
+  convert4to16(i8, %1, %v1)
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
+  convert16to4(i8, %r16, %r)
+  ret <4 x i8> %r
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) {
+  convert4to8(i16, %0, %v0)
+  convert4to8(i16, %1, %v1)
+  %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1)
+  convert8to4(i16, %r16, %r)
+  ret <4 x i16> %r
+}
+')
+
+;;8-wide vector saturation arithmetic
+
+;; 8 x i16 operands are passed to the SSE2 intrinsics directly; 8 x i8
+;; operands are padded with undef lanes to 16 x i8 first, and the low 8
+;; lanes of the intrinsic result are returned.
+define(`saturation_arithmetic_vec8', `
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) {
+  convert8to16(i8, %0, %v0)
+  convert8to16(i8, %1, %v1)
+  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
+  convert16to8(i8, %r16, %r)
+  ret <8 x i8> %r
+}
+
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) {
+  %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %res
+}
+
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) {
+  convert8to16(i8, %0, %v0)
+  convert8to16(i8, %1, %v1)
+  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
+  convert16to8(i8, %r16, %r)
+  ret <8 x i8> %r
+}
+
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) {
+  %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %res
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) {
+  convert8to16(i8, %0, %v0)
+  convert8to16(i8, %1, %v1)
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
+  convert16to8(i8, %r16, %r)
+  ret <8 x i8> %r
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) {
+  %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %res
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) {
+  convert8to16(i8, %0, %v0)
+  convert8to16(i8, %1, %v1)
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
+  convert16to8(i8, %r16, %r)
+  ret <8 x i8> %r    
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) {
+  %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %res
+}
+')
+
+;;16-wide vector saturation arithmetic
+
+;; 16 x i8 operands are passed to the SSE2 intrinsics directly; 16 x i16
+;; operands go through a helper macro (defined elsewhere in this file) that
+;; is handed the 8 x i16 intrinsic -- presumably it applies the intrinsic to
+;; each 8-lane half; confirm against the helper.
+define(`saturation_arithmetic_vec16', `
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) {
+  %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) {
+  binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
+  ret <16 x i16> %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) {
+  %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) {
+  binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)  
+  ret <16 x i16> %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) {
+  %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) {
+  binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
+  ret <16 x i16> %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) {
+  %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) {
+  binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)  
+  ret <16 x i16> %ret
+}
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
 ;; vector deconstruction utilities
 ;; split 8-wide vector into 2 4-wide vectors
 ;;
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -57,6 +57,43 @@
  #error Unknown value of ISPC_MASK_BITS
 #endif

+/* Limits of integral types.
+   Each macro is defined only if it is not defined already; the *_MIN
+   values are derived from the matching *_MAX value. */
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef INT64_MAX
+#define INT64_MAX              (9223372036854775807)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295)
+#endif
+#ifndef UINT64_MAX
+#define UINT64_MAX             (18446744073709551615)
+#endif
+#ifndef INT8_MIN
+#define INT8_MIN               (-INT8_MAX - 1)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-INT16_MAX - 1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-INT32_MAX - 1)
+#endif
+#ifndef INT64_MIN
+#define INT64_MIN              (-INT64_MAX - 1)
+#endif
 ///////////////////////////////////////////////////////////////////////////
 // Low level primitives

@@ -4345,6 +4382,108 @@ static inline void fastmath() {
    __fastmath();
 }

+///////////////////////////////////////////////////////////////////////////
+// saturation arithmetic
+
+// Signed saturating add for uniform int8. The sum is computed on unsigned
+// copies of the operands; on overflow the result is replaced by 0x7F or 0x80
+// depending on the sign of a.
+static inline uniform int8 saturating_add(uniform int8 a, uniform int8 b) {
+    uniform unsigned int8 a_unsig = a, b_unsig = b;
+    uniform unsigned int8 result = a_unsig + b_unsig;
+    // a_unsig becomes the saturation value: 0x7F if the sign bit of a is
+    // clear, 0x80 if it is set. Its sign bit still equals the sign bit of a.
+    a_unsig = (a_unsig >> 7) + INT8_MAX;
+    // The sign bit of the tested value is 0 only when a and b have the same
+    // sign and the sign of result differs from b, i.e. the add overflowed.
+    if ((uniform int8) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
+        result = a_unsig;
+    return result;
+}
+
+// Signed saturating add for varying int8; forwards to the __padds_vi8 builtin.
+static inline varying int8 saturating_add(varying int8 a, varying int8 b) {
+    return __padds_vi8(a, b);
+}
+
+// Signed saturating add for uniform int16. The sum is computed on unsigned
+// copies of the operands; on overflow the result is replaced by 0x7FFF or
+// 0x8000 depending on the sign of a.
+static inline uniform int16 saturating_add(uniform int16 a, uniform int16 b) {
+    uniform unsigned int16 a_unsig = a, b_unsig = b;
+    uniform unsigned int16 result = a_unsig + b_unsig;
+    // a_unsig becomes the saturation value: 0x7FFF if the sign bit of a is
+    // clear, 0x8000 if it is set. Its sign bit still equals the sign bit of a.
+    a_unsig = (a_unsig >> 15) + INT16_MAX;
+    // The sign bit of the tested value is 0 only when a and b have the same
+    // sign and the sign of result differs from b, i.e. the add overflowed.
+    if ((uniform int16) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
+        result = a_unsig;
+    return result;
+}
+
+// Signed saturating add for varying int16; forwards to the __padds_vi16 builtin.
+static inline varying int16 saturating_add(varying int16 a, varying int16 b) {
+    return __padds_vi16(a, b);
+}
+
+// Unsigned saturating add for uniform unsigned int8: a + b, with all bits of
+// the result set when the 8-bit sum wrapped around.
+static inline uniform unsigned int8 saturating_add(uniform unsigned int8 a, 
+                                                   uniform unsigned int8 b) {
+    uniform unsigned int8 result = a + b;
+    // result < a only if the sum wrapped. Negating the converted comparison
+    // then yields an all-ones value that is OR-ed into result
+    // (assumes a true comparison converts to 1 -- TODO confirm).
+    result |= (-(uniform int8)(result < a));
+    return result;
+}
+
+// Unsigned saturating add for varying unsigned int8; forwards to the
+// __paddus_vi8 builtin.
+static inline varying unsigned int8 saturating_add(varying unsigned int8 a, 
+                                                   varying unsigned int8 b) {
+    return __paddus_vi8(a, b);
+}
+
+// Unsigned saturating add for uniform unsigned int16: a + b, with all bits of
+// the result set when the 16-bit sum wrapped around.
+static inline uniform unsigned int16 saturating_add(uniform unsigned int16 a, 
+                                                    uniform unsigned int16 b) {
+    uniform unsigned int16 result = a + b;
+    // result < a only if the sum wrapped. Negating the converted comparison
+    // then yields an all-ones value that is OR-ed into result
+    // (assumes a true comparison converts to 1 -- TODO confirm).
+    result |= (-(uniform int16)(result < a));
+    return result;
+}
+
+// Unsigned saturating add for varying unsigned int16; forwards to the
+// __paddus_vi16 builtin.
+static inline varying unsigned int16 saturating_add(varying unsigned int16 a, 
+                                                    varying unsigned int16 b) {
+    return __paddus_vi16(a, b);
+}
+
+// Signed saturating subtract for uniform int8. The difference is computed on
+// unsigned copies of the operands; on overflow the result is replaced by
+// 0x7F or 0x80 depending on the sign of a.
+static inline uniform int8 saturating_sub(uniform int8 a, uniform int8 b) {
+    uniform unsigned int8 a_unsig = a, b_unsig = b;
+    uniform unsigned int8 result = a_unsig - b_unsig;
+    // a_unsig becomes the saturation value: 0x7F if the sign bit of a is
+    // clear, 0x80 if it is set. Its sign bit still equals the sign bit of a.
+    a_unsig = (a_unsig >> 7) + INT8_MAX;
+    // The sign bit of the tested value is 1 only when a and b differ in sign
+    // and the sign of result differs from a, i.e. the subtract overflowed.
+    if ((uniform int8) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
+        result = a_unsig;
+    return result;
+}
+
+// Signed saturating subtract for varying int8; forwards to the __psubs_vi8
+// builtin.
+static inline varying int8 saturating_sub(varying int8 a, varying int8 b) {
+    return __psubs_vi8(a, b);
+}
+
+// Signed saturating subtract for uniform int16. The difference is computed on
+// unsigned copies of the operands; on overflow the result is replaced by
+// 0x7FFF or 0x8000 depending on the sign of a.
+static inline uniform int16 saturating_sub(uniform int16 a, uniform int16 b) {
+    uniform unsigned int16 a_unsig = a, b_unsig = b;
+    uniform unsigned int16 result = a_unsig - b_unsig;
+    // a_unsig becomes the saturation value: 0x7FFF if the sign bit of a is
+    // clear, 0x8000 if it is set. Its sign bit still equals the sign bit of a.
+    a_unsig = (a_unsig >> 15) + INT16_MAX;
+    // The sign bit of the tested value is 1 only when a and b differ in sign
+    // and the sign of result differs from a, i.e. the subtract overflowed.
+    if ((uniform int16) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
+        result = a_unsig;
+    return result;
+}
+
+// Signed saturating subtract for varying int16; forwards to the __psubs_vi16
+// builtin.
+static inline varying int16 saturating_sub(varying int16 a, varying int16 b) {
+    return __psubs_vi16(a, b);
+}
+
+// Unsigned saturating subtract for uniform unsigned int8: a - b, cleared to 0
+// when the 8-bit difference wrapped around.
+static inline uniform unsigned int8 saturating_sub(uniform unsigned int8 a, 
+                                                   uniform unsigned int8 b) {
+    uniform unsigned int8 result = a - b;
+    // result <= a holds when no borrow occurred. The negated comparison is
+    // then an all-ones value that keeps result; otherwise it is 0 and the
+    // AND clears result (assumes a true comparison converts to 1 -- TODO
+    // confirm).
+    result &= (-(uniform int8)(result <= a));
+    return result;
+}
+
+// Unsigned saturating subtract for varying unsigned int8; forwards to the
+// __psubus_vi8 builtin.
+static inline varying unsigned int8 saturating_sub(varying unsigned int8 a, 
+                                                   varying unsigned int8 b) {
+    return __psubus_vi8(a, b);
+}
+
+// Unsigned saturating subtract for uniform unsigned int16: a - b, cleared to
+// 0 when the 16-bit difference wrapped around.
+static inline uniform unsigned int16 saturating_sub(uniform unsigned int16 a, 
+                                                    uniform unsigned int16 b) {
+    uniform unsigned int16 result = a - b;
+    // result <= a holds when no borrow occurred. The negated comparison is
+    // then an all-ones value that keeps result; otherwise it is 0 and the
+    // AND clears result (assumes a true comparison converts to 1 -- TODO
+    // confirm).
+    result &= (-(uniform int16)(result <= a));
+    return result;
+}
+
+// Unsigned saturating subtract for varying unsigned int16; forwards to the
+// __psubus_vi16 builtin.
+static inline varying unsigned int16 saturating_sub(varying unsigned int16 a, 
+                                                    varying unsigned int16 b) {
+    return __psubus_vi16(a, b);
+}
 ///////////////////////////////////////////////////////////////////////////
 // rdrand

--- a/tests/padds_i16.ispc
+++ b/tests/padds_i16.ispc
@@ -0,0 +1,27 @@
+
+// Test of saturating_add on uniform int16. Lanes cycle through three cases:
+// max + b (saturates high), min + (-b) (saturates low), min + b (no
+// saturation). The expected -32763 implies b == 5 -- presumably supplied by
+// the test harness; confirm against the driver.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    uniform int16 a_max = 32767, a_min = -32768; // max and min signed int16
+    if (programIndex % 3 == 0) {
+        RET[programIndex] = saturating_add(a_max, b);
+    }
+    else if (programIndex % 3 == 1) {
+        RET[programIndex] = saturating_add(a_min, -b);
+    }
+    else {
+        RET[programIndex] = saturating_add(a_min, b);
+    } 
+}
+
+// Expected per-lane values for f_fu.
+export void result(uniform float RET[]) {
+    if (programIndex % 3 == 0) {
+        RET[programIndex] = (uniform int16) 32767;
+    }
+    else if (programIndex % 3 == 1) {
+        RET[programIndex] = (uniform int16) -32768;
+    }
+    else {
+        RET[programIndex] = (uniform int16) -32763;
+    } 
+}
--- a/tests/padds_i8.ispc
+++ b/tests/padds_i8.ispc
@@ -0,0 +1,27 @@
+
+// Test of saturating_add on uniform int8. Lanes cycle through three cases:
+// max + b (saturates high), min + (-b) (saturates low), min + b (no
+// saturation). The expected -123 implies b == 5 -- presumably supplied by
+// the test harness; confirm against the driver.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    uniform int8 a_max = 127, a_min = -128; // max and min signed int8
+    if (programIndex % 3 == 0) {
+        RET[programIndex] = saturating_add(a_max, b);
+    }
+    else if (programIndex % 3 == 1) {
+        RET[programIndex] = saturating_add(a_min, -b);
+    }
+    else {
+        RET[programIndex] = saturating_add(a_min, b);
+    } 
+}
+
+// Expected per-lane values for f_fu.
+export void result(uniform float RET[]) {
+    if (programIndex % 3 == 0) {
+        RET[programIndex] = (uniform int8) 127;
+    }
+    else if (programIndex % 3 == 1) {
+        RET[programIndex] = (uniform int8) -128;
+    }
+    else {
+        RET[programIndex] = (uniform int8) -123;
+    } 
+}
--- a/tests/padds_vi16.ispc
+++ b/tests/padds_vi16.ispc
@@ -0,0 +1,27 @@
+
+// Test of saturating_add on varying int16. Lanes cycle through three cases:
+// max + b (saturates high), min + (-b) (saturates low), min + b (no
+// saturation). The expected -32763 implies b == 5 -- presumably supplied by
+// the test harness; confirm against the driver.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    varying int16 a_max = 32767, a_min = -32768; // max and min signed int16
+    if (programIndex % 3 == 0) {
+        RET[programIndex] = saturating_add(a_max, b);
+    }
+    else if (programIndex % 3 == 1) {
+        RET[programIndex] = saturating_add(a_min, -b);
+    }
+    else {
+        RET[programIndex] = saturating_add(a_min, b);
+    } 
+}
+
+// Expected per-lane values for f_fu.
+export void result(uniform float RET[]) {
+    if (programIndex % 3 == 0) {
+        RET[programIndex] = (varying int16) 32767;
+    }
+    else if (programIndex % 3 == 1) {
+        RET[programIndex] = (varying int16) -32768;
+    }
+    else {
+        RET[programIndex] = (varying int16) -32763;
+    } 
+}
--- a/tests/padds_vi8.ispc
+++ b/tests/padds_vi8.ispc
@@ -0,0 +1,27 @@
+
+// Test of saturating_add on varying int8. Lanes cycle through three cases:
+// max + b (saturates high), min + (-b) (saturates low), min + b (no
+// saturation). The expected -123 implies b == 5 -- presumably supplied by
+// the test harness; confirm against the driver.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    varying int8 a_max = 127, a_min = -128; // max and min signed int8
+    if (programIndex % 3 == 0) {
+        RET[programIndex] = saturating_add(a_max, b);
+    }
+    else if (programIndex % 3 == 1) {
+        RET[programIndex] = saturating_add(a_min, -b);
+    }
+    else {
+        RET[programIndex] = saturating_add(a_min, b);
+    } 
+}
+
+// Expected per-lane values for f_fu.
+export void result(uniform float RET[]) {
+    if (programIndex % 3 == 0) {
+        RET[programIndex] = (varying int8) 127;
+    }
+    else if (programIndex % 3 == 1) {
+        RET[programIndex] = (varying int8) -128;
+    }
+    else {
+        RET[programIndex] = (varying int8) -123;
+    } 
+}
--- a/tests/paddus_i16.ispc
+++ b/tests/paddus_i16.ispc
@@ -0,0 +1,21 @@
+
+// Test of saturating_add on uniform unsigned int16. Even lanes compute
+// max + b (saturates), odd lanes compute 0 + b (no saturation). The expected
+// value 5 implies b == 5 -- presumably supplied by the test harness; confirm
+// against the driver.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    uniform unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16
+    if (programIndex % 2 == 0) {
+        RET[programIndex] = saturating_add(a_max, b);
+    }
+    else {
+        RET[programIndex] = saturating_add(a_min, b);
+    } 
+}
+
+// Expected per-lane values for f_fu.
+export void result(uniform float RET[]) {
+    if (programIndex % 2 == 0) {
+        RET[programIndex] = (uniform unsigned int16) 65535;
+    }
+    else {
+        RET[programIndex] = (uniform unsigned int16) 5;
+    } 
+}
--- a/tests/paddus_i8.ispc
+++ b/tests/paddus_i8.ispc
@@ -0,0 +1,21 @@
+
+// Test of saturating_add on uniform unsigned int8. Even lanes compute
+// max + b (saturates), odd lanes compute 0 + b (no saturation). The expected
+// value 5 implies b == 5 -- presumably supplied by the test harness; confirm
+// against the driver.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    uniform unsigned int8 a_max = 255, a_min = 0; // max and min unsigned int8
+    if (programIndex % 2 == 0) {
+        RET[programIndex] = saturating_add(a_max, b);
+    }
+    else {
+        RET[programIndex] = saturating_add(a_min, b);
+    } 
+}
+
+// Expected per-lane values for f_fu.
+export void result(uniform float RET[]) {
+    if (programIndex % 2 == 0) {
+        RET[programIndex] = (uniform unsigned int8) 255;
+    }
+    else {
+        RET[programIndex] = (uniform unsigned int8) 5;
+    } 
+}
--- a/tests/paddus_vi16.ispc
+++ b/tests/paddus_vi16.ispc
@@ -0,0 +1,21 @@
+
+// Test of saturating_add on varying unsigned int16. Even lanes compute
+// max + b (saturates), odd lanes compute 0 + b (no saturation). The expected
+// value 5 implies b == 5 -- presumably supplied by the test harness; confirm
+// against the driver.
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    varying unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16
+    if (programIndex % 2 == 0) {
+        RET[programIndex] = saturating_add(a_max, b);
+    }
+    else {
+        RET[programIndex] = saturating_add(a_min, b);
+    } 
+}
+
+// Expected per-lane values for f_fu.
+export void result(uniform float RET[]) {
+    if (programIndex % 2 == 0) {
+        RET[programIndex] = (varying unsigned int16) 65535;
+    }
+    else {
+        RET[programIndex] = (varying unsigned int16) 5;
+    } 
+}
--- a/tests/paddus_vi8.ispc
+++ b/tests/paddus_vi8.ispc
@@ -0,0 +1,22 @@
+
+export uniform int width() { return programCount; } // exported so the caller can query programCount
+
+// Even program instances add b to hi, odd ones to lo, via saturating_add.
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    varying unsigned int8 lo = 0;   // smallest unsigned int8
+    varying unsigned int8 hi = 255; // largest unsigned int8
+    if (programIndex % 2 != 0)
+        RET[programIndex] = saturating_add(lo, b);
+    else
+        RET[programIndex] = saturating_add(hi, b);
+}
+
+// Writes 255 for even program instances and 5 for odd ones
+// (presumably the values f_fu is expected to produce).
+export void result(uniform float RET[]) {
+    if (programIndex % 2 != 0)
+        RET[programIndex] = (varying unsigned int8) 5;
+    else
+        RET[programIndex] = (varying unsigned int8) 255;
+}
+
--- a/tests/psubs_i16.ispc
+++ b/tests/psubs_i16.ispc
@@ -0,0 +1,27 @@
+
+export uniform int width() { return programCount; } // exported so the caller can query programCount
+
+// By programIndex % 3, computes saturating_sub of: (lo, b), (hi, -b), (hi, b).
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    uniform int16 lo = -32768; // smallest signed int16
+    uniform int16 hi = 32767;  // largest signed int16
+    varying int phase = programIndex % 3;
+    if (phase == 0)
+        RET[programIndex] = saturating_sub(lo, b);
+    else if (phase == 1)
+        RET[programIndex] = saturating_sub(hi, -b);
+    else
+        RET[programIndex] = saturating_sub(hi, b);
+}
+
+// Writes -32768, 32767 or 32762 depending on programIndex % 3 being 0, 1 or 2
+// (presumably the values f_fu is expected to produce).
+export void result(uniform float RET[]) {
+    varying int phase = programIndex % 3;
+    if (phase == 0)
+        RET[programIndex] = (uniform int16) -32768;
+    else if (phase == 1)
+        RET[programIndex] = (uniform int16) 32767;
+    else
+        RET[programIndex] = (uniform int16) 32762;
+}
--- a/tests/psubs_i8.ispc
+++ b/tests/psubs_i8.ispc
@@ -0,0 +1,27 @@
+
+export uniform int width() { return programCount; } // exported so the caller can query programCount
+
+// By programIndex % 3, computes saturating_sub of: (lo, b), (hi, -b), (hi, b).
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    uniform int8 lo = -128; // smallest signed int8
+    uniform int8 hi = 127;  // largest signed int8
+    varying int phase = programIndex % 3;
+    if (phase == 0)
+        RET[programIndex] = saturating_sub(lo, b);
+    else if (phase == 1)
+        RET[programIndex] = saturating_sub(hi, -b);
+    else
+        RET[programIndex] = saturating_sub(hi, b);
+}
+
+// Writes -128, 127 or 122 depending on programIndex % 3 being 0, 1 or 2
+// (presumably the values f_fu is expected to produce).
+export void result(uniform float RET[]) {
+    varying int phase = programIndex % 3;
+    if (phase == 0)
+        RET[programIndex] = (uniform int8) -128;
+    else if (phase == 1)
+        RET[programIndex] = (uniform int8) 127;
+    else
+        RET[programIndex] = (uniform int8) 122;
+}
--- a/tests/psubs_vi16.ispc
+++ b/tests/psubs_vi16.ispc
@@ -0,0 +1,27 @@
+
+export uniform int width() { return programCount; } // exported so the caller can query programCount
+
+// By programIndex % 3, computes saturating_sub of: (lo, b), (hi, -b), (hi, b).
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    varying int16 lo = -32768; // smallest signed int16
+    varying int16 hi = 32767;  // largest signed int16
+    varying int phase = programIndex % 3;
+    if (phase == 0)
+        RET[programIndex] = saturating_sub(lo, b);
+    else if (phase == 1)
+        RET[programIndex] = saturating_sub(hi, -b);
+    else
+        RET[programIndex] = saturating_sub(hi, b);
+}
+
+// Writes -32768, 32767 or 32762 depending on programIndex % 3 being 0, 1 or 2
+// (presumably the values f_fu is expected to produce).
+export void result(uniform float RET[]) {
+    varying int phase = programIndex % 3;
+    if (phase == 0)
+        RET[programIndex] = (varying int16) -32768;
+    else if (phase == 1)
+        RET[programIndex] = (varying int16) 32767;
+    else
+        RET[programIndex] = (varying int16) 32762;
+}
--- a/tests/psubs_vi8.ispc
+++ b/tests/psubs_vi8.ispc
@@ -0,0 +1,27 @@
+
+export uniform int width() { return programCount; } // exported so the caller can query programCount
+
+// By programIndex % 3, computes saturating_sub of: (lo, b), (hi, -b), (hi, b).
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    varying int8 lo = -128; // smallest signed int8
+    varying int8 hi = 127;  // largest signed int8
+    varying int phase = programIndex % 3;
+    if (phase == 0)
+        RET[programIndex] = saturating_sub(lo, b);
+    else if (phase == 1)
+        RET[programIndex] = saturating_sub(hi, -b);
+    else
+        RET[programIndex] = saturating_sub(hi, b);
+}
+
+// Writes -128, 127 or 122 depending on programIndex % 3 being 0, 1 or 2
+// (presumably the values f_fu is expected to produce).
+export void result(uniform float RET[]) {
+    varying int phase = programIndex % 3;
+    if (phase == 0)
+        RET[programIndex] = (varying int8) -128;
+    else if (phase == 1)
+        RET[programIndex] = (varying int8) 127;
+    else
+        RET[programIndex] = (varying int8) 122;
+}
--- a/tests/psubus_i16.ispc
+++ b/tests/psubus_i16.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; } // exported so the caller can query programCount
+
+// Even program instances subtract b from lo, odd ones from hi, via saturating_sub.
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    uniform unsigned int16 lo = 0;     // smallest unsigned int16
+    uniform unsigned int16 hi = 65535; // largest unsigned int16
+    if (programIndex % 2 != 0)
+        RET[programIndex] = saturating_sub(hi, b);
+    else
+        RET[programIndex] = saturating_sub(lo, b);
+}
+
+// Writes 0 for even program instances and 65530 for odd ones
+// (presumably the values f_fu is expected to produce).
+export void result(uniform float RET[]) {
+    if (programIndex % 2 != 0)
+        RET[programIndex] = (uniform unsigned int16) 65530;
+    else
+        RET[programIndex] = (uniform unsigned int16) 0;
+}
--- a/tests/psubus_i8.ispc
+++ b/tests/psubus_i8.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; } // exported so the caller can query programCount
+
+// Even program instances subtract b from lo, odd ones from hi, via saturating_sub.
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    uniform unsigned int8 lo = 0;   // smallest unsigned int8
+    uniform unsigned int8 hi = 255; // largest unsigned int8
+    if (programIndex % 2 != 0)
+        RET[programIndex] = saturating_sub(hi, b);
+    else
+        RET[programIndex] = saturating_sub(lo, b);
+}
+
+// Writes 0 for even program instances and 250 for odd ones
+// (presumably the values f_fu is expected to produce).
+export void result(uniform float RET[]) {
+    if (programIndex % 2 != 0)
+        RET[programIndex] = (uniform unsigned int8) 250;
+    else
+        RET[programIndex] = (uniform unsigned int8) 0;
+}
--- a/tests/psubus_vi16.ispc
+++ b/tests/psubus_vi16.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; } // exported so the caller can query programCount
+
+// Even program instances subtract b from lo, odd ones from hi, via saturating_sub.
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    varying unsigned int16 lo = 0;     // smallest unsigned int16
+    varying unsigned int16 hi = 65535; // largest unsigned int16
+    if (programIndex % 2 != 0)
+        RET[programIndex] = saturating_sub(hi, b);
+    else
+        RET[programIndex] = saturating_sub(lo, b);
+}
+
+// Writes 0 for even program instances and 65530 for odd ones
+// (presumably the values f_fu is expected to produce).
+export void result(uniform float RET[]) {
+    if (programIndex % 2 != 0)
+        RET[programIndex] = (varying unsigned int16) 65530;
+    else
+        RET[programIndex] = (varying unsigned int16) 0;
+}
--- a/tests/psubus_vi8.ispc
+++ b/tests/psubus_vi8.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; } // exported so the caller can query programCount
+
+// Even program instances subtract b from lo, odd ones from hi, via saturating_sub.
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    varying unsigned int8 lo = 0;   // smallest unsigned int8
+    varying unsigned int8 hi = 255; // largest unsigned int8
+    if (programIndex % 2 != 0)
+        RET[programIndex] = saturating_sub(hi, b);
+    else
+        RET[programIndex] = saturating_sub(lo, b);
+}
+
+// Writes 0 for even program instances and 250 for odd ones
+// (presumably the values f_fu is expected to produce).
+export void result(uniform float RET[]) {
+    if (programIndex % 2 != 0)
+        RET[programIndex] = (varying unsigned int8) 250;
+    else
+        RET[programIndex] = (varying unsigned int8) 0;
+}