Merge pull request #673 from Vsevolod-Livinskij/master

Saturation arithmetic.
2014-02-11 16:40:40 +03:00
parent 2570385770 a3c77e6dc6
commit e8039cd822
36 changed files with 961 additions and 3 deletions
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -489,12 +489,20 @@ lSetInternalFunctions(llvm::Module *module) {
        "__packed_load_active",
        "__packed_store_active",
        "__packed_store_active2",
        "__padds_vi8",
        "__padds_vi16",
        "__paddus_vi8",
        "__paddus_vi16",
        "__popcnt_int32",
        "__popcnt_int64",
        "__prefetch_read_uniform_1",
        "__prefetch_read_uniform_2",
        "__prefetch_read_uniform_3",
        "__prefetch_read_uniform_nt",
        "__psubs_vi8",
        "__psubs_vi16",
        "__psubus_vi8",
        "__psubus_vi16",
        "__rcp_uniform_float",
        "__rcp_varying_float",
        "__rcp_uniform_double",
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -40,6 +40,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
 saturation_arithmetic()
 include(`target-avx-common.ll')
--- a/builtins/target-avx1-i64x4base.ll
+++ b/builtins/target-avx1-i64x4base.ll
@@ -40,6 +40,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
 saturation_arithmetic()
 include(`target-avx-common.ll')
--- a/builtins/target-avx1.ll
+++ b/builtins/target-avx1.ll
@@ -32,6 +32,7 @@
 include(`target-avx.ll')
 rdrand_decls()
 saturation_arithmetic()
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max
--- a/builtins/target-avx11.ll
+++ b/builtins/target-avx11.ll
@@ -35,6 +35,8 @@ ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
       `rdrand_definition()')
 saturation_arithmetic()
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max
--- a/builtins/target-avx2.ll
+++ b/builtins/target-avx2.ll
@@ -39,6 +39,8 @@ ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
       LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
       `rdrand_definition()')
 saturation_arithmetic()
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -10,6 +10,7 @@ packed_load_and_store()
 scans()
 int64minmax()
 aossoa()
 saturation_arithmetic_novec()
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
--- a/builtins/target-generic-16.ll
+++ b/builtins/target-generic-16.ll
@@ -31,4 +31,4 @@
 define(`WIDTH',`16')
 include(`target-generic-common.ll')
-
+saturation_arithmetic_novec()
--- a/builtins/target-generic-32.ll
+++ b/builtins/target-generic-32.ll
@@ -31,3 +31,4 @@
 define(`WIDTH',`32')
 include(`target-generic-common.ll')
 saturation_arithmetic_novec()
--- a/builtins/target-generic-4.ll
+++ b/builtins/target-generic-4.ll
@@ -31,4 +31,4 @@
 define(`WIDTH',`4')
 include(`target-generic-common.ll')
-
+saturation_arithmetic_novec()
--- a/builtins/target-generic-64.ll
+++ b/builtins/target-generic-64.ll
@@ -31,3 +31,4 @@
 define(`WIDTH',`64')
 include(`target-generic-common.ll')
 saturation_arithmetic_novec()
--- a/builtins/target-generic-8.ll
+++ b/builtins/target-generic-8.ll
@@ -31,4 +31,4 @@
 define(`WIDTH',`8')
 include(`target-generic-common.ll')
-
+saturation_arithmetic_novec()
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -44,6 +44,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
 saturation_arithmetic()
 include(`target-sse2-common.ll')
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -41,6 +41,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
 saturation_arithmetic()
 include(`target-sse2-common.ll')
--- a/builtins/target-sse4-16.ll
+++ b/builtins/target-sse4-16.ll
@@ -41,6 +41,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
 saturation_arithmetic()
 include(`target-sse4-common.ll')
--- a/builtins/target-sse4-8.ll
+++ b/builtins/target-sse4-8.ll
@@ -41,6 +41,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
 saturation_arithmetic()
 include(`target-sse4-common.ll')
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -44,6 +44,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
 saturation_arithmetic()
 include(`target-sse4-common.ll')
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -41,6 +41,7 @@ stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
 saturation_arithmetic()
 include(`target-sse4-common.ll')
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -49,6 +49,416 @@ define(`MASK_HIGH_BIT_ON',
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector conversion utilities
 ;; convert vector of one width into vector of other width
 ;;
 ;; $1: vector element type
 ;; $2: vector of the first width
 ;; $3: vector of the second width
 ;; 1 -> 8 lanes: lane 0 is copied, lanes 1..7 are left undef
 define(`convert1to8', `
  $3 = shufflevector <1 x $1> $2, <1 x $1> undef, <8 x i32> <
      i32 0,     i32 undef, i32 undef, i32 undef,
      i32 undef, i32 undef, i32 undef, i32 undef>
 ')
 ;; 1 -> 16 lanes: lane 0 is copied, lanes 1..15 are left undef
 define(`convert1to16', `
  $3 = shufflevector <1 x $1> $2, <1 x $1> undef, <16 x i32> <
      i32 0,     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
      i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ')
 ;; 4 -> 8 lanes: lanes 0..3 are copied, lanes 4..7 are left undef
 define(`convert4to8', `
  $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <8 x i32> <
      i32 0,     i32 1,     i32 2,     i32 3,
      i32 undef, i32 undef, i32 undef, i32 undef>
 ')
 ;; 4 -> 16 lanes: lanes 0..3 are copied, lanes 4..15 are left undef
 define(`convert4to16', `
  $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <16 x i32> <
      i32 0,     i32 1,     i32 2,     i32 3,     i32 undef, i32 undef, i32 undef, i32 undef,
      i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ')
 ;; 8 -> 16 lanes: lanes 0..7 are copied, lanes 8..15 are left undef
 define(`convert8to16', `
  $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <16 x i32> <
      i32 0,     i32 1,     i32 2,     i32 3,     i32 4,     i32 5,     i32 6,     i32 7,
      i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ')
 ;; 4 -> 32 lanes: lanes 0..3 are copied, lanes 4..31 are left undef
 define(`convert4to32', `
  $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <32 x i32> <
      i32 0,     i32 1,     i32 2,     i32 3,     i32 undef, i32 undef, i32 undef, i32 undef,
      i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
      i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
      i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ')
 ;; 8 -> 32 lanes: lanes 0..7 are copied, lanes 8..31 are left undef.
 ;; Both operands are typed as 8-wide vectors so that mask entries 0..7
 ;; address the source vector.
 define(`convert8to32', `
  $3 = shufflevector <8 x $1> $2, <8 x $1> undef,
  <32 x i32> <i32 0, i32 1, i32 2, i32 3,
              i32 4, i32 5, i32 6, i32 7,
              i32 undef, i32 undef, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef>
 ')
 ;; 16 -> 32 lanes: lanes 0..15 are copied, lanes 16..31 are left undef.
 ;; Both operands are typed as 16-wide vectors so that mask entries 0..15
 ;; address the source vector.
 define(`convert16to32', `
  $3 = shufflevector <16 x $1> $2, <16 x $1> undef,
  <32 x i32> <i32  0, i32  1, i32  2, i32  3,
              i32  4, i32  5, i32  6, i32  7,
              i32  8, i32  9, i32 10, i32 11,
              i32 12, i32 13, i32 14, i32 15,
              i32 undef, i32 undef, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef>
 ')
 ;; 8 -> 1 lanes: only lane 0 is kept
 define(`convert8to1', `
  $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <1 x i32> <i32 0>
 ')
 ;; 16 -> 1 lanes: only lane 0 is kept
 define(`convert16to1', `
  $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <1 x i32> <i32 0>
 ')
 ;; 8 -> 4 lanes: lanes 0..3 are kept
 define(`convert8to4', `
  $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <4 x i32> <
      i32 0, i32 1, i32 2, i32 3>
 ')
 ;; 16 -> 4 lanes: lanes 0..3 are kept
 define(`convert16to4', `
  $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> <
      i32 0, i32 1, i32 2, i32 3>
 ')
 ;; 16 -> 8 lanes: lanes 0..7 are kept
 define(`convert16to8', `
  $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <8 x i32> <
      i32 0, i32 1, i32 2, i32 3,
      i32 4, i32 5, i32 6, i32 7>
 ')
 ;; 32 -> 4 lanes: lanes 0..3 are kept
 define(`convert32to4', `
  $3 = shufflevector <32 x $1> $2, <32 x $1> undef, <4 x i32> <
      i32 0, i32 1, i32 2, i32 3>
 ')
 ;; 32 -> 8 lanes: lanes 0..7 are kept.
 ;; The mask lists one entry per result lane, so it carries 8 entries.
 define(`convert32to8', `
  $3 = shufflevector <32 x $1> $2, <32 x $1> undef,
    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ')
 ;; 32 -> 16 lanes: lanes 0..15 are kept.
 ;; The mask lists one entry per result lane, so it carries 16 entries.
 define(`convert32to16', `
  $3 = shufflevector <32 x $1> $2, <32 x $1> undef,
    <16 x i32> <i32  0, i32  1, i32  2, i32  3, i32  4, i32  5, i32  6, i32  7,
                i32  8, i32  9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;saturation arithmetic
 ;;
 ;; Entry point for targets that use the SSE2 saturating intrinsics: expands
 ;; to the 4-, 8- or 16-wide implementation matching the target vector width.
 ;; For any other width an error is printed and m4 stops with exit status 1.
 define(`saturation_arithmetic',
 `ifelse(WIDTH,  `4', `saturation_arithmetic_vec4()', 
        WIDTH,  `8', `saturation_arithmetic_vec8()',
        WIDTH, `16', `saturation_arithmetic_vec16() ',
                     `errprint(`ERROR: saturation_arithmetic() macro called with unsupported width = 'WIDTH
 )
                      m4exit(`1')')
 ')
 ;; create vector constant. Used by saturation_arithmetic_novec_universal below.
 ;; Expands to a vector literal in which every lane holds the same typed value.
 ;; The lane count follows the target vector width: 4, 8, 16, 32 and 64 are
 ;; written out explicitly; any other width yields a single-element vector.
 ;;
 ;; $1: element type
 ;; $2: element value
 define(`const_vector', `
 ifelse(WIDTH,  `4', `<$1 $2, $1 $2, $1 $2, $1 $2>', 
       WIDTH,  `8', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
       WIDTH, `16', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
       WIDTH, `32', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
       WIDTH, `64', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
                       $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
                        `<$1 $2>')')
 ;; Generic saturating add/sub written with plain LLVM vector instructions:
 ;; operands are widened to twice their size, combined, clamped and narrowed.
 ;; Helper for saturation_arithmetic_novec below; target .ll files should not
 ;; invoke it directly.
 ;; $1: {add,sub} (LLVM opcode, also spliced into the function names)
 define(`saturation_arithmetic_novec_universal', `
 ;; signed i8: computed in i16, clamped into -128..127
 define <WIDTH x i8> @__p$1s_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  %lhs_wide = sext <WIDTH x i8> %0 to <WIDTH x i16>
  %rhs_wide = sext <WIDTH x i8> %1 to <WIDTH x i16>
  %wide = $1 <WIDTH x i16> %lhs_wide, %rhs_wide
  %is_high = icmp sgt <WIDTH x i16> %wide, const_vector(i16, 127)
  %capped = select <WIDTH x i1> %is_high, <WIDTH x i16> const_vector(i16, 127), <WIDTH x i16> %wide
  %is_low = icmp slt <WIDTH x i16> %wide, const_vector(i16, -128)
  %clamped = select <WIDTH x i1> %is_low, <WIDTH x i16> const_vector(i16, -128), <WIDTH x i16> %capped
  %narrow = trunc <WIDTH x i16> %clamped to <WIDTH x i8>
  ret <WIDTH x i8> %narrow
 }
 ;; signed i16: computed in i32, clamped into -32768..32767
 define <WIDTH x i16> @__p$1s_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  %lhs_wide = sext <WIDTH x i16> %0 to <WIDTH x i32>
  %rhs_wide = sext <WIDTH x i16> %1 to <WIDTH x i32>
  %wide = $1 <WIDTH x i32> %lhs_wide, %rhs_wide
  %is_high = icmp sgt <WIDTH x i32> %wide, const_vector(i32, 32767)
  %capped = select <WIDTH x i1> %is_high, <WIDTH x i32> const_vector(i32, 32767), <WIDTH x i32> %wide
  %is_low = icmp slt <WIDTH x i32> %wide, const_vector(i32, -32768)
  %clamped = select <WIDTH x i1> %is_low, <WIDTH x i32> const_vector(i32, -32768), <WIDTH x i32> %capped
  %narrow = trunc <WIDTH x i32> %clamped to <WIDTH x i16>
  ret <WIDTH x i16> %narrow
 }
 ;; unsigned i8: computed in i16, clamped into 0..255
 ;; a negative wide value passes the unsigned upper test as well, but the
 ;; signed lower test is applied last and selects 0 for it
 define <WIDTH x i8> @__p$1us_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  %lhs_wide = zext <WIDTH x i8> %0 to <WIDTH x i16>
  %rhs_wide = zext <WIDTH x i8> %1 to <WIDTH x i16>
  %wide = $1 <WIDTH x i16> %lhs_wide, %rhs_wide
  %is_high = icmp ugt <WIDTH x i16> %wide, const_vector(i16, 255)
  %capped = select <WIDTH x i1> %is_high, <WIDTH x i16> const_vector(i16, 255), <WIDTH x i16> %wide
  %is_low = icmp slt <WIDTH x i16> %wide, const_vector(i16, 0)
  %clamped = select <WIDTH x i1> %is_low, <WIDTH x i16> const_vector(i16, 0), <WIDTH x i16> %capped
  %narrow = trunc <WIDTH x i16> %clamped to <WIDTH x i8>
  ret <WIDTH x i8> %narrow
 }
 ;; unsigned i16: computed in i32, clamped into 0..65535
 define <WIDTH x i16> @__p$1us_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  %lhs_wide = zext <WIDTH x i16> %0 to <WIDTH x i32>
  %rhs_wide = zext <WIDTH x i16> %1 to <WIDTH x i32>
  %wide = $1 <WIDTH x i32> %lhs_wide, %rhs_wide
  %is_high = icmp ugt <WIDTH x i32> %wide, const_vector(i32, 65535)
  %capped = select <WIDTH x i1> %is_high, <WIDTH x i32> const_vector(i32, 65535), <WIDTH x i32> %wide
  %is_low = icmp slt <WIDTH x i32> %wide, const_vector(i32, 0)
  %clamped = select <WIDTH x i1> %is_low, <WIDTH x i32> const_vector(i32, 0), <WIDTH x i32> %capped
  %narrow = trunc <WIDTH x i32> %clamped to <WIDTH x i16>
  ret <WIDTH x i16> %narrow
 }
 ')
 ;; Implementation for targets that do not have hardware instructions for
 ;; saturating arithmetic: emits the generic add and sub variants.
 define(`saturation_arithmetic_novec', `
 saturation_arithmetic_novec_universal(add)
 saturation_arithmetic_novec_universal(sub)
 ')
 ;;4-wide vector saturation arithmetic
 ;; Every 4-lane operand is widened to a full SSE2 vector (upper lanes undef),
 ;; the SSE2 saturating intrinsic is applied and the low 4 lanes are returned.
 define(`saturation_arithmetic_vec4', `
 ;; signed add
 declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
 define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) {
  convert4to16(i8, %0, %lhs_w)
  convert4to16(i8, %1, %rhs_w)
  %sat_w = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %lhs_w, <16 x i8> %rhs_w)
  convert16to4(i8, %sat_w, %sat)
  ret <4 x i8> %sat
 }
 declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) {
  convert4to8(i16, %0, %lhs_w)
  convert4to8(i16, %1, %rhs_w)
  %sat_w = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %lhs_w, <8 x i16> %rhs_w)
  convert8to4(i16, %sat_w, %sat)
  ret <4 x i16> %sat
 }
 ;; unsigned add
 declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
 define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) {
  convert4to16(i8, %0, %lhs_w)
  convert4to16(i8, %1, %rhs_w)
  %sat_w = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %lhs_w, <16 x i8> %rhs_w)
  convert16to4(i8, %sat_w, %sat)
  ret <4 x i8> %sat
 }
 declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <4 x i16> @__paddus_vi16(<4 x i16>, <4 x i16>) {
  convert4to8(i16, %0, %lhs_w)
  convert4to8(i16, %1, %rhs_w)
  %sat_w = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %lhs_w, <8 x i16> %rhs_w)
  convert8to4(i16, %sat_w, %sat)
  ret <4 x i16> %sat
 }
 ;; signed sub
 declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
 define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) {
  convert4to16(i8, %0, %lhs_w)
  convert4to16(i8, %1, %rhs_w)
  %sat_w = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %lhs_w, <16 x i8> %rhs_w)
  convert16to4(i8, %sat_w, %sat)
  ret <4 x i8> %sat
 }
 declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) {
  convert4to8(i16, %0, %lhs_w)
  convert4to8(i16, %1, %rhs_w)
  %sat_w = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %lhs_w, <8 x i16> %rhs_w)
  convert8to4(i16, %sat_w, %sat)
  ret <4 x i16> %sat
 }
 ;; unsigned sub
 declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
 define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) {
  convert4to16(i8, %0, %lhs_w)
  convert4to16(i8, %1, %rhs_w)
  %sat_w = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %lhs_w, <16 x i8> %rhs_w)
  convert16to4(i8, %sat_w, %sat)
  ret <4 x i8> %sat
 }
 declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) {
  convert4to8(i16, %0, %lhs_w)
  convert4to8(i16, %1, %rhs_w)
  %sat_w = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %lhs_w, <8 x i16> %rhs_w)
  convert8to4(i16, %sat_w, %sat)
  ret <4 x i16> %sat
 }
 ')
 ;;8-wide vector saturation arithmetic
 ;; The i16 operands already match the SSE2 vector type and go straight to the
 ;; intrinsic; the i8 operands are widened to 16 lanes (upper lanes undef)
 ;; first and the low 8 lanes of the result are returned.
 define(`saturation_arithmetic_vec8', `
 ;; signed add
 declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
 define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) {
  convert8to16(i8, %0, %lhs_w)
  convert8to16(i8, %1, %rhs_w)
  %sat_w = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %lhs_w, <16 x i8> %rhs_w)
  convert16to8(i8, %sat_w, %sat)
  ret <8 x i8> %sat
 }
 declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <8 x i16> @__padds_vi16(<8 x i16> %lhs, <8 x i16> %rhs) {
  %sat = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %lhs, <8 x i16> %rhs)
  ret <8 x i16> %sat
 }
 ;; unsigned add
 declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
 define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) {
  convert8to16(i8, %0, %lhs_w)
  convert8to16(i8, %1, %rhs_w)
  %sat_w = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %lhs_w, <16 x i8> %rhs_w)
  convert16to8(i8, %sat_w, %sat)
  ret <8 x i8> %sat
 }
 declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <8 x i16> @__paddus_vi16(<8 x i16> %lhs, <8 x i16> %rhs) {
  %sat = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %lhs, <8 x i16> %rhs)
  ret <8 x i16> %sat
 }
 ;; signed sub
 declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
 define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) {
  convert8to16(i8, %0, %lhs_w)
  convert8to16(i8, %1, %rhs_w)
  %sat_w = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %lhs_w, <16 x i8> %rhs_w)
  convert16to8(i8, %sat_w, %sat)
  ret <8 x i8> %sat
 }
 declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <8 x i16> @__psubs_vi16(<8 x i16> %lhs, <8 x i16> %rhs) {
  %sat = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %lhs, <8 x i16> %rhs)
  ret <8 x i16> %sat
 }
 ;; unsigned sub
 declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
 define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) {
  convert8to16(i8, %0, %lhs_w)
  convert8to16(i8, %1, %rhs_w)
  %sat_w = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %lhs_w, <16 x i8> %rhs_w)
  convert16to8(i8, %sat_w, %sat)
  ret <8 x i8> %sat
 }
 declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <8 x i16> @__psubus_vi16(<8 x i16> %lhs, <8 x i16> %rhs) {
  %sat = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %lhs, <8 x i16> %rhs)
  ret <8 x i16> %sat
 }
 ')
 ;;16-wide vector saturation arithmetic
 ;; The i8 operands already match the SSE2 vector type and go straight to the
 ;; intrinsic. The i16 operands are handed, together with the 8-wide
 ;; intrinsic, to a helper macro defined outside this block (presumably it
 ;; applies the intrinsic to each 8-lane half -- TODO confirm).
 define(`saturation_arithmetic_vec16', `
 ;; signed add
 declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
 define <16 x i8> @__padds_vi8(<16 x i8> %lhs, <16 x i8> %rhs) {
  %sat = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %lhs, <16 x i8> %rhs)
  ret <16 x i8> %sat
 }
 declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <16 x i16> @__padds_vi16(<16 x i16> %lhs, <16 x i16> %rhs) {
  binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %lhs, %rhs)
  ret <16 x i16> %ret
 }
 ;; unsigned add
 declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
 define <16 x i8> @__paddus_vi8(<16 x i8> %lhs, <16 x i8> %rhs) {
  %sat = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %lhs, <16 x i8> %rhs)
  ret <16 x i8> %sat
 }
 declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <16 x i16> @__paddus_vi16(<16 x i16> %lhs, <16 x i16> %rhs) {
  binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %lhs, %rhs)
  ret <16 x i16> %ret
 }
 ;; signed sub
 declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
 define <16 x i8> @__psubs_vi8(<16 x i8> %lhs, <16 x i8> %rhs) {
  %sat = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %lhs, <16 x i8> %rhs)
  ret <16 x i8> %sat
 }
 declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <16 x i16> @__psubs_vi16(<16 x i16> %lhs, <16 x i16> %rhs) {
  binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %lhs, %rhs)
  ret <16 x i16> %ret
 }
 ;; unsigned sub
 declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
 define <16 x i8> @__psubus_vi8(<16 x i8> %lhs, <16 x i8> %rhs) {
  %sat = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %lhs, <16 x i8> %rhs)
  ret <16 x i8> %sat
 }
 declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <16 x i16> @__psubus_vi16(<16 x i16> %lhs, <16 x i16> %rhs) {
  binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %lhs, %rhs)
  ret <16 x i16> %ret
 }
 ')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector deconstruction utilities
 ;; split 8-wide vector into 2 4-wide vectors
 ;;
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -57,6 +57,43 @@
  #error Unknown value of ISPC_MASK_BITS
 #endif
 /* Limits of integral types. */
 // Every limit is guarded by #ifndef, so a definition that is already in
 // effect when this point is reached is left untouched.

 // Largest values of the signed types.
 #ifndef INT8_MAX
 #define INT8_MAX               (127)
 #endif
 #ifndef INT16_MAX
 #define INT16_MAX              (32767)
 #endif
 #ifndef INT32_MAX
 #define INT32_MAX              (2147483647)
 #endif
 #ifndef INT64_MAX
 #define INT64_MAX              (9223372036854775807)
 #endif
 // Largest values of the unsigned types.
 #ifndef UINT8_MAX
 #define UINT8_MAX              (255)
 #endif
 #ifndef UINT16_MAX
 #define UINT16_MAX             (65535)
 #endif
 #ifndef UINT32_MAX
 #define UINT32_MAX             (4294967295)
 #endif
 #ifndef UINT64_MAX
 #define UINT64_MAX             (18446744073709551615)
 #endif
 // Smallest values of the signed types, each derived from the matching
 // maximum as (-MAX - 1) (presumably so that no literal one past the signed
 // maximum has to be written).
 #ifndef INT8_MIN
 #define INT8_MIN               (-INT8_MAX - 1)
 #endif
 #ifndef INT16_MIN
 #define INT16_MIN              (-INT16_MAX - 1)
 #endif
 #ifndef INT32_MIN
 #define INT32_MIN              (-INT32_MAX - 1)
 #endif
 #ifndef INT64_MIN
 #define INT64_MIN              (-INT64_MAX - 1)
 #endif
 ///////////////////////////////////////////////////////////////////////////
 // Low level primitives
@@ -4345,6 +4382,108 @@ static inline void fastmath() {
    __fastmath();
 }
 ///////////////////////////////////////////////////////////////////////////
 // saturation arithmetic
 // Signed 8-bit saturating addition for uniform values: a sum that would
 // leave the int8 range is replaced by the limit on the side of a
 // (127 for non-negative a, -128 for negative a).
 static inline uniform int8 saturating_add(uniform int8 a, uniform int8 b) {
    uniform unsigned int8 ua = a;
    uniform unsigned int8 ub = b;
    uniform unsigned int8 wrapped = ua + ub;
    // Value to fall back on: 0x7f when a is non-negative, 0x80 otherwise.
    uniform unsigned int8 bound = (ua >> 7) + INT8_MAX;
    // The probe's sign bit is clear only when the bound (hence a) has the
    // same sign as b while the wrapped sum has the opposite sign: overflow.
    uniform int8 probe = (uniform int8) ((bound ^ ub) | ~(ub ^ wrapped));
    if (probe < 0)
        return wrapped;
    return bound;
 }
 // Signed 8-bit saturating addition for varying values; forwards both
 // operands to the __padds_vi8 builtin.
 static inline varying int8 saturating_add(varying int8 a, varying int8 b) {
    varying int8 sum = __padds_vi8(a, b);
    return sum;
 }
 // Signed 16-bit saturating addition for uniform values: a sum that would
 // leave the int16 range is replaced by the limit on the side of a
 // (32767 for non-negative a, -32768 for negative a).
 static inline uniform int16 saturating_add(uniform int16 a, uniform int16 b) {
    uniform unsigned int16 ua = a;
    uniform unsigned int16 ub = b;
    uniform unsigned int16 wrapped = ua + ub;
    // Value to fall back on: 0x7fff when a is non-negative, 0x8000 otherwise.
    uniform unsigned int16 bound = (ua >> 15) + INT16_MAX;
    // The probe's sign bit is clear only when the bound (hence a) has the
    // same sign as b while the wrapped sum has the opposite sign: overflow.
    uniform int16 probe = (uniform int16) ((bound ^ ub) | ~(ub ^ wrapped));
    if (probe < 0)
        return wrapped;
    return bound;
 }
 // Signed 16-bit saturating addition for varying values; forwards both
 // operands to the __padds_vi16 builtin.
 static inline varying int16 saturating_add(varying int16 a, varying int16 b) {
    varying int16 sum = __padds_vi16(a, b);
    return sum;
 }
 // Unsigned 8-bit saturating addition for uniform values: a sum that wraps
 // around is replaced by 255.
 static inline uniform unsigned int8 saturating_add(uniform unsigned int8 a, 
                                                    uniform unsigned int8 b) {
    uniform unsigned int8 sum = a + b;
    // The sum wrapped exactly when it ended up below a. Negating that flag
    // gives the pattern that is ORed in (all ones on wrap, assuming the
    // comparison converts to 1).
    uniform int8 fill = -(uniform int8)(sum < a);
    sum |= fill;
    return sum;
 }
 // Unsigned 8-bit saturating addition for varying values; forwards both
 // operands to the __paddus_vi8 builtin.
 static inline varying unsigned int8 saturating_add(varying unsigned int8 a, 
                                                    varying unsigned int8 b) {
    varying unsigned int8 sum = __paddus_vi8(a, b);
    return sum;
 }
 // Unsigned 16-bit saturating addition for uniform values: a sum that wraps
 // around is replaced by 65535.
 static inline uniform unsigned int16 saturating_add(uniform unsigned int16 a, 
                                                     uniform unsigned int16 b) {
    uniform unsigned int16 sum = a + b;
    // The sum wrapped exactly when it ended up below a. Negating that flag
    // gives the pattern that is ORed in (all ones on wrap, assuming the
    // comparison converts to 1).
    uniform int16 fill = -(uniform int16)(sum < a);
    sum |= fill;
    return sum;
 }
 // Unsigned 16-bit saturating addition for varying values; forwards both
 // operands to the __paddus_vi16 builtin.
 static inline varying unsigned int16 saturating_add(varying unsigned int16 a, 
                                                     varying unsigned int16 b) {
    varying unsigned int16 sum = __paddus_vi16(a, b);
    return sum;
 }
 // Signed 8-bit saturating subtraction for uniform values: a difference that
 // would leave the int8 range is replaced by the limit on the side of a
 // (127 for non-negative a, -128 for negative a).
 static inline uniform int8 saturating_sub(uniform int8 a, uniform int8 b) {
    uniform unsigned int8 ua = a;
    uniform unsigned int8 ub = b;
    uniform unsigned int8 wrapped = ua - ub;
    // Value to fall back on: 0x7f when a is non-negative, 0x80 otherwise.
    uniform unsigned int8 bound = (ua >> 7) + INT8_MAX;
    // The probe's sign bit is set only when the bound (hence a) differs in
    // sign from both b and the wrapped difference: overflow.
    uniform int8 probe = (uniform int8) ((bound ^ ub) & (bound ^ wrapped));
    if (probe < 0)
        return bound;
    return wrapped;
 }
 // Signed 8-bit saturating subtraction for varying values; forwards both
 // operands to the __psubs_vi8 builtin.
 static inline varying int8 saturating_sub(varying int8 a, varying int8 b) {
    varying int8 diff = __psubs_vi8(a, b);
    return diff;
 }
 // Signed 16-bit saturating subtraction for uniform values: a difference
 // that would leave the int16 range is replaced by the limit on the side of
 // a (32767 for non-negative a, -32768 for negative a).
 static inline uniform int16 saturating_sub(uniform int16 a, uniform int16 b) {
    uniform unsigned int16 ua = a;
    uniform unsigned int16 ub = b;
    uniform unsigned int16 wrapped = ua - ub;
    // Value to fall back on: 0x7fff when a is non-negative, 0x8000 otherwise.
    uniform unsigned int16 bound = (ua >> 15) + INT16_MAX;
    // The probe's sign bit is set only when the bound (hence a) differs in
    // sign from both b and the wrapped difference: overflow.
    uniform int16 probe = (uniform int16) ((bound ^ ub) & (bound ^ wrapped));
    if (probe < 0)
        return bound;
    return wrapped;
 }
 // Signed 16-bit saturating subtraction for varying values; forwards both
 // operands to the __psubs_vi16 builtin.
 static inline varying int16 saturating_sub(varying int16 a, varying int16 b) {
    varying int16 diff = __psubs_vi16(a, b);
    return diff;
 }
 // Unsigned 8-bit saturating subtraction for uniform values: a difference
 // that wraps below zero is replaced by 0.
 static inline uniform unsigned int8 saturating_sub(uniform unsigned int8 a, 
                                                    uniform unsigned int8 b) {
    uniform unsigned int8 diff = a - b;
    // No borrow happened exactly when the difference does not exceed a.
    // Negating that flag gives the pattern that is ANDed in (all ones keeps
    // the difference, zero clears it; assumes the comparison converts to 1).
    uniform int8 keep = -(uniform int8)(diff <= a);
    diff &= keep;
    return diff;
 }
 // Unsigned 8-bit saturating subtraction for varying values; forwards both
 // operands to the __psubus_vi8 builtin.
 static inline varying unsigned int8 saturating_sub(varying unsigned int8 a, 
                                                    varying unsigned int8 b) {
    varying unsigned int8 diff = __psubus_vi8(a, b);
    return diff;
 }
 // Unsigned 16-bit saturating subtraction for uniform values: a difference
 // that wraps below zero is replaced by 0.
 static inline uniform unsigned int16 saturating_sub(uniform unsigned int16 a, 
                                                     uniform unsigned int16 b) {
    uniform unsigned int16 diff = a - b;
    // No borrow happened exactly when the difference does not exceed a.
    // Negating that flag gives the pattern that is ANDed in (all ones keeps
    // the difference, zero clears it; assumes the comparison converts to 1).
    uniform int16 keep = -(uniform int16)(diff <= a);
    diff &= keep;
    return diff;
 }
 // Unsigned 16-bit saturating subtraction for varying values; forwards both
 // operands to the __psubus_vi16 builtin.
 static inline varying unsigned int16 saturating_sub(varying unsigned int16 a, 
                                                     varying unsigned int16 b) {
    varying unsigned int16 diff = __psubus_vi16(a, b);
    return diff;
 }
 ///////////////////////////////////////////////////////////////////////////
 // rdrand
--- a/tests/padds_i16.ispc
+++ b/tests/padds_i16.ispc
@@ -0,0 +1,27 @@
 // Exercises saturating_add() with uniform int16 limits as the first operand.
 export uniform int width() { return programCount; }
 // Each lane stores one sum, chosen by programIndex % 3:
 //   0: a_max + b     -- the expected value below is the clamp 32767
 //   1: a_min + (-b)  -- the expected value below is the clamp -32768
 //   2: a_min + b     -- the expected value below is -32763, which implies
 //                       that b is 5 (TODO confirm against the harness)
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int16 a_max = 32767, a_min = -32768; // max and min signed int16
    if (programIndex % 3 == 0) {
        RET[programIndex] = saturating_add(a_max, b);
    }
    else if (programIndex % 3 == 1) {
        RET[programIndex] = saturating_add(a_min, -b);
    }
    else {
        RET[programIndex] = saturating_add(a_min, b);
    } 
 }
 // Stores the value expected in each lane, using the same three-way split.
 export void result(uniform float RET[]) {
    if (programIndex % 3 == 0) {
        RET[programIndex] = (uniform int16) 32767;
    }
    else if (programIndex % 3 == 1) {
        RET[programIndex] = (uniform int16) -32768;
    }
    else {
        RET[programIndex] = (uniform int16) -32763;
    } 
 }
--- a/tests/padds_i8.ispc
+++ b/tests/padds_i8.ispc
@@ -0,0 +1,27 @@
 // Exercises saturating_add() with uniform int8 limits as the first operand.
 export uniform int width() { return programCount; }
 // Each lane stores one sum, chosen by programIndex % 3:
 //   0: a_max + b     -- the expected value below is the clamp 127
 //   1: a_min + (-b)  -- the expected value below is the clamp -128
 //   2: a_min + b     -- the expected value below is -123, which implies
 //                       that b is 5 (TODO confirm against the harness)
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int8 a_max = 127, a_min = -128; // max and min signed int8
    if (programIndex % 3 == 0) {
        RET[programIndex] = saturating_add(a_max, b);
    }
    else if (programIndex % 3 == 1) {
        RET[programIndex] = saturating_add(a_min, -b);
    }
    else {
        RET[programIndex] = saturating_add(a_min, b);
    } 
 }
 // Stores the value expected in each lane, using the same three-way split.
 export void result(uniform float RET[]) {
    if (programIndex % 3 == 0) {
        RET[programIndex] = (uniform int8) 127;
    }
    else if (programIndex % 3 == 1) {
        RET[programIndex] = (uniform int8) -128;
    }
    else {
        RET[programIndex] = (uniform int8) -123;
    } 
 }
--- a/tests/padds_vi16.ispc
+++ b/tests/padds_vi16.ispc
@@ -0,0 +1,27 @@
 // Exercises saturating_add() with varying int16 limits as the first operand.
 export uniform int width() { return programCount; }
 // Each lane stores one sum, chosen by programIndex % 3:
 //   0: a_max + b     -- the expected value below is the clamp 32767
 //   1: a_min + (-b)  -- the expected value below is the clamp -32768
 //   2: a_min + b     -- the expected value below is -32763, which implies
 //                       that b is 5 (TODO confirm against the harness)
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying int16 a_max = 32767, a_min = -32768; // max and min signed int16
    if (programIndex % 3 == 0) {
        RET[programIndex] = saturating_add(a_max, b);
    }
    else if (programIndex % 3 == 1) {
        RET[programIndex] = saturating_add(a_min, -b);
    }
    else {
        RET[programIndex] = saturating_add(a_min, b);
    } 
 }
 // Stores the value expected in each lane, using the same three-way split.
 export void result(uniform float RET[]) {
    if (programIndex % 3 == 0) {
        RET[programIndex] = (varying int16) 32767;
    }
    else if (programIndex % 3 == 1) {
        RET[programIndex] = (varying int16) -32768;
    }
    else {
        RET[programIndex] = (varying int16) -32763;
    } 
 }
--- a/tests/padds_vi8.ispc
+++ b/tests/padds_vi8.ispc
@@ -0,0 +1,27 @@
 // Exercises saturating_add() with varying int8 limits as the first operand.
 export uniform int width() { return programCount; }
 // Each lane stores one sum, chosen by programIndex % 3:
 //   0: a_max + b     -- the expected value below is the clamp 127
 //   1: a_min + (-b)  -- the expected value below is the clamp -128
 //   2: a_min + b     -- the expected value below is -123, which implies
 //                       that b is 5 (TODO confirm against the harness)
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying int8 a_max = 127, a_min = -128; // max and min signed int8
    if (programIndex % 3 == 0) {
        RET[programIndex] = saturating_add(a_max, b);
    }
    else if (programIndex % 3 == 1) {
        RET[programIndex] = saturating_add(a_min, -b);
    }
    else {
        RET[programIndex] = saturating_add(a_min, b);
    } 
 }
 // Stores the value expected in each lane, using the same three-way split.
 export void result(uniform float RET[]) {
    if (programIndex % 3 == 0) {
        RET[programIndex] = (varying int8) 127;
    }
    else if (programIndex % 3 == 1) {
        RET[programIndex] = (varying int8) -128;
    }
    else {
        RET[programIndex] = (varying int8) -123;
    } 
 }
--- a/tests/paddus_i16.ispc
+++ b/tests/paddus_i16.ispc
@@ -0,0 +1,21 @@
 // Exercises saturating_add() with uniform unsigned int16 limits as the
 // first operand.
 export uniform int width() { return programCount; }
 // Each lane stores one sum, chosen by programIndex % 2:
 //   0: a_max + b  -- the expected value below is the clamp 65535
 //   1: a_min + b  -- the expected value below is 5, which implies that b
 //                    is 5 (TODO confirm against the harness)
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16
    if (programIndex % 2 == 0) {
        RET[programIndex] = saturating_add(a_max, b);
    }
    else {
        RET[programIndex] = saturating_add(a_min, b);
    } 
 }
 // Stores the value expected in each lane, using the same two-way split.
 export void result(uniform float RET[]) {
    if (programIndex % 2 == 0) {
        RET[programIndex] = (uniform unsigned int16) 65535;
    }
    else {
        RET[programIndex] = (uniform unsigned int16) 5;
    } 
 }
--- a/tests/paddus_i8.ispc
+++ b/tests/paddus_i8.ispc
@@ -0,0 +1,21 @@
 // Exercises saturating_add() with uniform unsigned int8 limits as the
 // first operand.
 export uniform int width() { return programCount; }
 // Each lane stores one sum, chosen by programIndex % 2:
 //   0: a_max + b  -- the expected value below is the clamp 255
 //   1: a_min + b  -- the expected value below is 5, which implies that b
 //                    is 5 (TODO confirm against the harness)
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform unsigned int8 a_max = 255, a_min = 0; // max and min unsigned int8
    if (programIndex % 2 == 0) {
        RET[programIndex] = saturating_add(a_max, b);
    }
    else {
        RET[programIndex] = saturating_add(a_min, b);
    } 
 }
 // Stores the value expected in each lane, using the same two-way split.
 export void result(uniform float RET[]) {
    if (programIndex % 2 == 0) {
        RET[programIndex] = (uniform unsigned int8) 255;
    }
    else {
        RET[programIndex] = (uniform unsigned int8) 5;
    } 
 }
--- a/tests/paddus_vi16.ispc
+++ b/tests/paddus_vi16.ispc
@@ -0,0 +1,21 @@
 // Exercises saturating_add() with varying unsigned int16 limits as the
 // first operand.
 export uniform int width() { return programCount; }
 // Each lane stores one sum, chosen by programIndex % 2:
 //   0: a_max + b  -- the expected value below is the clamp 65535
 //   1: a_min + b  -- the expected value below is 5, which implies that b
 //                    is 5 (TODO confirm against the harness)
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16
    if (programIndex % 2 == 0) {
        RET[programIndex] = saturating_add(a_max, b);
    }
    else {
        RET[programIndex] = saturating_add(a_min, b);
    } 
 }
 // Stores the value expected in each lane, using the same two-way split.
 export void result(uniform float RET[]) {
    if (programIndex % 2 == 0) {
        RET[programIndex] = (varying unsigned int16) 65535;
    }
    else {
        RET[programIndex] = (varying unsigned int16) 5;
    } 
 }
--- a/tests/paddus_vi8.ispc
+++ b/tests/paddus_vi8.ispc
@@ -0,0 +1,22 @@
 // Exercises saturating_add() with varying unsigned int8 limits as the
 // first operand.
 export uniform int width() { return programCount; }
 // Each lane stores one sum, chosen by programIndex % 2:
 //   0: a_max + b  -- the expected value below is the clamp 255
 //   1: a_min + b  -- the expected value below is 5, which implies that b
 //                    is 5 (TODO confirm against the harness)
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying unsigned int8 a_max = 255, a_min = 0; // max and min unsigned int8
    if (programIndex % 2 == 0) {
        RET[programIndex] = saturating_add(a_max, b);
    }
    else {
        RET[programIndex] = saturating_add(a_min, b);
    } 
 }
 // Stores the value expected in each lane, using the same two-way split.
 export void result(uniform float RET[]) {
    if (programIndex % 2 == 0) {
        RET[programIndex] = (varying unsigned int8) 255;
    }
    else {
        RET[programIndex] = (varying unsigned int8) 5;
    } 
 }
--- a/tests/psubs_i16.ispc
+++ b/tests/psubs_i16.ispc
@@ -0,0 +1,27 @@
 // Returns programCount as a uniform int.
 export uniform int width() {
    return programCount;
 }
 // Exercises saturating_sub on uniform int16 operands. The case is chosen by
 // programIndex % 3:
 //   0         -> -32768 (smallest int16) minus b
 //   1         ->  32767 (largest int16) minus (-b)
 //   otherwise ->  32767 (largest int16) minus b
 // Each instance stores its difference, converted to float, in
 // RET[programIndex]. aFOO is not read.
 // NOTE(review): the values in result() appear to assume b == 5 -- confirm
 // against the test driver.
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int16 top = 32767;      // largest int16 value
    uniform int16 bottom = -32768;  // smallest int16 value
    int lane = programIndex % 3;
    if (lane == 0)
        RET[programIndex] = saturating_sub(bottom, b);
    else if (lane == 1)
        RET[programIndex] = saturating_sub(top, -b);
    else
        RET[programIndex] = saturating_sub(top, b);
 }
 // Fills RET[programIndex] according to programIndex % 3: -32768 for 0,
 // 32767 for 1, and 32762 otherwise, each cast to uniform int16 before being
 // stored as float. Presumably the harness compares this against f_fu.
 export void result(uniform float RET[]) {
    int lane = programIndex % 3;
    if (lane == 0)
        RET[programIndex] = (uniform int16) -32768;
    else if (lane == 1)
        RET[programIndex] = (uniform int16) 32767;
    else
        RET[programIndex] = (uniform int16) 32762;
 }
--- a/tests/psubs_i8.ispc
+++ b/tests/psubs_i8.ispc
@@ -0,0 +1,27 @@
 // Returns programCount as a uniform int.
 export uniform int width() {
    return programCount;
 }
 // Exercises saturating_sub on uniform int8 operands. The case is chosen by
 // programIndex % 3:
 //   0         -> -128 (smallest int8) minus b
 //   1         ->  127 (largest int8) minus (-b)
 //   otherwise ->  127 (largest int8) minus b
 // Each instance stores its difference, converted to float, in
 // RET[programIndex]. aFOO is not read.
 // NOTE(review): the values in result() appear to assume b == 5 -- confirm
 // against the test driver.
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int8 top = 127;      // largest int8 value
    uniform int8 bottom = -128;  // smallest int8 value
    int lane = programIndex % 3;
    if (lane == 0)
        RET[programIndex] = saturating_sub(bottom, b);
    else if (lane == 1)
        RET[programIndex] = saturating_sub(top, -b);
    else
        RET[programIndex] = saturating_sub(top, b);
 }
 // Fills RET[programIndex] according to programIndex % 3: -128 for 0, 127 for
 // 1, and 122 otherwise, each cast to uniform int8 before being stored as
 // float. Presumably the harness compares this against f_fu.
 export void result(uniform float RET[]) {
    int lane = programIndex % 3;
    if (lane == 0)
        RET[programIndex] = (uniform int8) -128;
    else if (lane == 1)
        RET[programIndex] = (uniform int8) 127;
    else
        RET[programIndex] = (uniform int8) 122;
 }
--- a/tests/psubs_vi16.ispc
+++ b/tests/psubs_vi16.ispc
@@ -0,0 +1,27 @@
 // Returns programCount as a uniform int.
 export uniform int width() {
    return programCount;
 }
 // Exercises saturating_sub on varying int16 operands. The case is chosen by
 // programIndex % 3:
 //   0         -> -32768 (smallest int16) minus b
 //   1         ->  32767 (largest int16) minus (-b)
 //   otherwise ->  32767 (largest int16) minus b
 // Each instance stores its difference, converted to float, in
 // RET[programIndex]. aFOO is not read.
 // NOTE(review): the values in result() appear to assume b == 5 -- confirm
 // against the test driver.
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying int16 top = 32767;      // largest int16 value
    varying int16 bottom = -32768;  // smallest int16 value
    int lane = programIndex % 3;
    if (lane == 0)
        RET[programIndex] = saturating_sub(bottom, b);
    else if (lane == 1)
        RET[programIndex] = saturating_sub(top, -b);
    else
        RET[programIndex] = saturating_sub(top, b);
 }
 // Fills RET[programIndex] according to programIndex % 3: -32768 for 0,
 // 32767 for 1, and 32762 otherwise, each cast to varying int16 before being
 // stored as float. Presumably the harness compares this against f_fu.
 export void result(uniform float RET[]) {
    int lane = programIndex % 3;
    if (lane == 0)
        RET[programIndex] = (varying int16) -32768;
    else if (lane == 1)
        RET[programIndex] = (varying int16) 32767;
    else
        RET[programIndex] = (varying int16) 32762;
 }
--- a/tests/psubs_vi8.ispc
+++ b/tests/psubs_vi8.ispc
@@ -0,0 +1,27 @@
 // Returns programCount as a uniform int.
 export uniform int width() {
    return programCount;
 }
 // Exercises saturating_sub on varying int8 operands. The case is chosen by
 // programIndex % 3:
 //   0         -> -128 (smallest int8) minus b
 //   1         ->  127 (largest int8) minus (-b)
 //   otherwise ->  127 (largest int8) minus b
 // Each instance stores its difference, converted to float, in
 // RET[programIndex]. aFOO is not read.
 // NOTE(review): the values in result() appear to assume b == 5 -- confirm
 // against the test driver.
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying int8 top = 127;      // largest int8 value
    varying int8 bottom = -128;  // smallest int8 value
    int lane = programIndex % 3;
    if (lane == 0)
        RET[programIndex] = saturating_sub(bottom, b);
    else if (lane == 1)
        RET[programIndex] = saturating_sub(top, -b);
    else
        RET[programIndex] = saturating_sub(top, b);
 }
 // Fills RET[programIndex] according to programIndex % 3: -128 for 0, 127 for
 // 1, and 122 otherwise, each cast to varying int8 before being stored as
 // float. Presumably the harness compares this against f_fu.
 export void result(uniform float RET[]) {
    int lane = programIndex % 3;
    if (lane == 0)
        RET[programIndex] = (varying int8) -128;
    else if (lane == 1)
        RET[programIndex] = (varying int8) 127;
    else
        RET[programIndex] = (varying int8) 122;
 }
--- a/tests/psubus_i16.ispc
+++ b/tests/psubus_i16.ispc
@@ -0,0 +1,21 @@
 // Returns programCount as a uniform int.
 export uniform int width() {
    return programCount;
 }
 // Exercises saturating_sub on uniform unsigned int16 operands. Even program
 // instances subtract b from 0 (the smallest unsigned int16); odd instances
 // subtract b from 65535 (the largest). Each instance stores its difference,
 // converted to float, in RET[programIndex]. aFOO is not read.
 // NOTE(review): the values in result() appear to assume b == 5 -- confirm
 // against the test driver.
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform unsigned int16 top = 65535;  // largest unsigned int16 value
    uniform unsigned int16 bottom = 0;   // smallest unsigned int16 value
    bool evenLane = (programIndex % 2 == 0);
    if (evenLane)
        RET[programIndex] = saturating_sub(bottom, b);
    else
        RET[programIndex] = saturating_sub(top, b);
 }
 // Fills RET[programIndex] with 0 on even program instances and 65530 on odd
 // ones, each cast to uniform unsigned int16 before being stored as float.
 // Presumably the harness compares this against the output of f_fu.
 export void result(uniform float RET[]) {
    bool evenLane = (programIndex % 2 == 0);
    if (evenLane)
        RET[programIndex] = (uniform unsigned int16) 0;
    else
        RET[programIndex] = (uniform unsigned int16) 65530;
 }
--- a/tests/psubus_i8.ispc
+++ b/tests/psubus_i8.ispc
@@ -0,0 +1,21 @@
 // Returns programCount as a uniform int.
 export uniform int width() {
    return programCount;
 }
 // Exercises saturating_sub on uniform unsigned int8 operands. Even program
 // instances subtract b from 0 (the smallest unsigned int8); odd instances
 // subtract b from 255 (the largest). Each instance stores its difference,
 // converted to float, in RET[programIndex]. aFOO is not read.
 // NOTE(review): the values in result() appear to assume b == 5 -- confirm
 // against the test driver.
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform unsigned int8 top = 255;   // largest unsigned int8 value
    uniform unsigned int8 bottom = 0;  // smallest unsigned int8 value
    bool evenLane = (programIndex % 2 == 0);
    if (evenLane)
        RET[programIndex] = saturating_sub(bottom, b);
    else
        RET[programIndex] = saturating_sub(top, b);
 }
 // Fills RET[programIndex] with 0 on even program instances and 250 on odd
 // ones, each cast to uniform unsigned int8 before being stored as float.
 // Presumably the harness compares this against the output of f_fu.
 export void result(uniform float RET[]) {
    bool evenLane = (programIndex % 2 == 0);
    if (evenLane)
        RET[programIndex] = (uniform unsigned int8) 0;
    else
        RET[programIndex] = (uniform unsigned int8) 250;
 }
--- a/tests/psubus_vi16.ispc
+++ b/tests/psubus_vi16.ispc
@@ -0,0 +1,21 @@
 // Returns programCount as a uniform int.
 export uniform int width() {
    return programCount;
 }
 // Exercises saturating_sub on varying unsigned int16 operands. Even program
 // instances subtract b from 0 (the smallest unsigned int16); odd instances
 // subtract b from 65535 (the largest). Each instance stores its difference,
 // converted to float, in RET[programIndex]. aFOO is not read.
 // NOTE(review): the values in result() appear to assume b == 5 -- confirm
 // against the test driver.
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying unsigned int16 top = 65535;  // largest unsigned int16 value
    varying unsigned int16 bottom = 0;   // smallest unsigned int16 value
    bool evenLane = (programIndex % 2 == 0);
    if (evenLane)
        RET[programIndex] = saturating_sub(bottom, b);
    else
        RET[programIndex] = saturating_sub(top, b);
 }
 // Fills RET[programIndex] with 0 on even program instances and 65530 on odd
 // ones, each cast to varying unsigned int16 before being stored as float.
 // Presumably the harness compares this against the output of f_fu.
 export void result(uniform float RET[]) {
    bool evenLane = (programIndex % 2 == 0);
    if (evenLane)
        RET[programIndex] = (varying unsigned int16) 0;
    else
        RET[programIndex] = (varying unsigned int16) 65530;
 }
--- a/tests/psubus_vi8.ispc
+++ b/tests/psubus_vi8.ispc
@@ -0,0 +1,21 @@
 // Returns programCount as a uniform int.
 export uniform int width() {
    return programCount;
 }
 // Exercises saturating_sub on varying unsigned int8 operands. Even program
 // instances subtract b from a_min (0); odd instances subtract b from a_max
 // (255). Each instance stores its difference, converted to float, in
 // RET[programIndex]. aFOO is not read.
 // NOTE(review): the values in result() appear to assume b == 5 -- confirm
 // against the test driver.
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying unsigned int8 a_max = 255, a_min = 0; // max and min unsigned int8
    if (programIndex % 2 == 0) {
        RET[programIndex] = saturating_sub(a_min, b);
    }
    else {
        RET[programIndex] = saturating_sub(a_max, b);
    }
 }
 // Fills RET[programIndex] with 0 on even program instances and 250 on odd
 // ones, each cast to varying unsigned int8 before being stored as float.
 // Presumably the harness compares this against the output of f_fu.
 export void result(uniform float RET[]) {
    bool evenLane = (programIndex % 2 == 0);
    if (evenLane)
        RET[programIndex] = (varying unsigned int8) 0;
    else
        RET[programIndex] = (varying unsigned int8) 250;
 }