Merge pull request #673 from Vsevolod-Livinskij/master

Saturation arithmetic.
This commit is contained in:
Dmitry Babokin
2014-02-11 16:40:40 +03:00
36 changed files with 961 additions and 3 deletions

View File

@@ -489,12 +489,20 @@ lSetInternalFunctions(llvm::Module *module) {
"__packed_load_active", "__packed_load_active",
"__packed_store_active", "__packed_store_active",
"__packed_store_active2", "__packed_store_active2",
"__padds_vi8",
"__padds_vi16",
"__paddus_vi8",
"__paddus_vi16",
"__popcnt_int32", "__popcnt_int32",
"__popcnt_int64", "__popcnt_int64",
"__prefetch_read_uniform_1", "__prefetch_read_uniform_1",
"__prefetch_read_uniform_2", "__prefetch_read_uniform_2",
"__prefetch_read_uniform_3", "__prefetch_read_uniform_3",
"__prefetch_read_uniform_nt", "__prefetch_read_uniform_nt",
"__psubs_vi8",
"__psubs_vi16",
"__psubus_vi8",
"__psubus_vi16",
"__rcp_uniform_float", "__rcp_uniform_float",
"__rcp_varying_float", "__rcp_varying_float",
"__rcp_uniform_double", "__rcp_uniform_double",

View File

@@ -40,6 +40,7 @@ stdlib_core()
packed_load_and_store() packed_load_and_store()
scans() scans()
int64minmax() int64minmax()
saturation_arithmetic()
include(`target-avx-common.ll') include(`target-avx-common.ll')

View File

@@ -40,6 +40,7 @@ stdlib_core()
packed_load_and_store() packed_load_and_store()
scans() scans()
int64minmax() int64minmax()
saturation_arithmetic()
include(`target-avx-common.ll') include(`target-avx-common.ll')

View File

@@ -32,6 +32,7 @@
include(`target-avx.ll') include(`target-avx.ll')
rdrand_decls() rdrand_decls()
saturation_arithmetic()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max ;; int min/max

View File

@@ -35,6 +35,8 @@ ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()') `rdrand_definition()')
saturation_arithmetic()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max ;; int min/max

View File

@@ -39,6 +39,8 @@ ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()') `rdrand_definition()')
saturation_arithmetic()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max ;; int min/max

View File

@@ -10,6 +10,7 @@ packed_load_and_store()
scans() scans()
int64minmax() int64minmax()
aossoa() aossoa()
saturation_arithmetic_novec()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store ;; masked store

View File

@@ -31,4 +31,4 @@
define(`WIDTH',`16') define(`WIDTH',`16')
include(`target-generic-common.ll') include(`target-generic-common.ll')
saturation_arithmetic_novec()

View File

@@ -31,3 +31,4 @@
define(`WIDTH',`32') define(`WIDTH',`32')
include(`target-generic-common.ll') include(`target-generic-common.ll')
saturation_arithmetic_novec()

View File

@@ -31,4 +31,4 @@
define(`WIDTH',`4') define(`WIDTH',`4')
include(`target-generic-common.ll') include(`target-generic-common.ll')
saturation_arithmetic_novec()

View File

@@ -31,3 +31,4 @@
define(`WIDTH',`64') define(`WIDTH',`64')
include(`target-generic-common.ll') include(`target-generic-common.ll')
saturation_arithmetic_novec()

View File

@@ -31,4 +31,4 @@
define(`WIDTH',`8') define(`WIDTH',`8')
include(`target-generic-common.ll') include(`target-generic-common.ll')
saturation_arithmetic_novec()

View File

@@ -44,6 +44,7 @@ stdlib_core()
packed_load_and_store() packed_load_and_store()
scans() scans()
int64minmax() int64minmax()
saturation_arithmetic()
include(`target-sse2-common.ll') include(`target-sse2-common.ll')

View File

@@ -41,6 +41,7 @@ stdlib_core()
packed_load_and_store() packed_load_and_store()
scans() scans()
int64minmax() int64minmax()
saturation_arithmetic()
include(`target-sse2-common.ll') include(`target-sse2-common.ll')

View File

@@ -41,6 +41,7 @@ stdlib_core()
packed_load_and_store() packed_load_and_store()
scans() scans()
int64minmax() int64minmax()
saturation_arithmetic()
include(`target-sse4-common.ll') include(`target-sse4-common.ll')

View File

@@ -41,6 +41,7 @@ stdlib_core()
packed_load_and_store() packed_load_and_store()
scans() scans()
int64minmax() int64minmax()
saturation_arithmetic()
include(`target-sse4-common.ll') include(`target-sse4-common.ll')

View File

@@ -44,6 +44,7 @@ stdlib_core()
packed_load_and_store() packed_load_and_store()
scans() scans()
int64minmax() int64minmax()
saturation_arithmetic()
include(`target-sse4-common.ll') include(`target-sse4-common.ll')

View File

@@ -41,6 +41,7 @@ stdlib_core()
packed_load_and_store() packed_load_and_store()
scans() scans()
int64minmax() int64minmax()
saturation_arithmetic()
include(`target-sse4-common.ll') include(`target-sse4-common.ll')

View File

@@ -49,6 +49,416 @@ define(`MASK_HIGH_BIT_ON',
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; vector conversion utilities
;; convert a vector of one width into a vector of another width
;;
;; $1: vector element type
;; $2: source vector (of the first width)
;; $3: name of the resulting vector (of the second width)
;; widen a 1-element vector to 8 elements: lane 0 is kept, lanes 1..7 are undef
define(`convert1to8', `
$3 = shufflevector <1 x $1> $2, <1 x $1> undef,
<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
')
;; widen a 1-element vector to 16 elements: lane 0 is kept, lanes 1..15 are undef
define(`convert1to16', `
$3 = shufflevector <1 x $1> $2, <1 x $1> undef,
<16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
')
;; widen a 4-element vector to 8 elements: lanes 0..3 are kept, lanes 4..7 are undef
define(`convert4to8', `
$3 = shufflevector <4 x $1> $2, <4 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 undef, i32 undef, i32 undef, i32 undef>
')
;; widen a 4-element vector to 16 elements: lanes 0..3 are kept, lanes 4..15 are undef
define(`convert4to16', `
$3 = shufflevector <4 x $1> $2, <4 x $1> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
')
;; widen an 8-element vector to 16 elements: lanes 0..7 are kept, lanes 8..15 are undef
define(`convert8to16', `
$3 = shufflevector <8 x $1> $2, <8 x $1> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
')
;; widen a 4-element vector to 32 elements: lanes 0..3 are kept, lanes 4..31 are undef
define(`convert4to32', `
$3 = shufflevector <4 x $1> $2, <4 x $1> undef,
<32 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
')
;; widen an 8-element vector to 32 elements: lanes 0..7 are kept, lanes 8..31 are undef
;; the source operands are typed as 8-element vectors so that the shuffle
;; indices 4..7 refer to lanes of the first operand
define(`convert8to32', `
$3 = shufflevector <8 x $1> $2, <8 x $1> undef,
<32 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
')
;; widen a 16-element vector to 32 elements: lanes 0..15 are kept, lanes 16..31 are undef
;; the source operands are typed as 16-element vectors so that the shuffle
;; indices 4..15 refer to lanes of the first operand
define(`convert16to32', `
$3 = shufflevector <16 x $1> $2, <16 x $1> undef,
<32 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11,
i32 12, i32 13, i32 14, i32 15,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
')
;; narrow an 8-element vector to its first element
define(`convert8to1', `
$3 = shufflevector <8 x $1> $2, <8 x $1> undef,
<1 x i32> <i32 0>
')
;; narrow a 16-element vector to its first element
define(`convert16to1', `
$3 = shufflevector <16 x $1> $2, <16 x $1> undef,
<1 x i32> <i32 0>
')
;; narrow an 8-element vector to its first 4 elements
define(`convert8to4', `
$3 = shufflevector <8 x $1> $2, <8 x $1> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
')
;; narrow a 16-element vector to its first 4 elements
define(`convert16to4', `
$3 = shufflevector <16 x $1> $2, <16 x $1> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
')
;; narrow a 16-element vector to its first 8 elements
define(`convert16to8', `
$3 = shufflevector <16 x $1> $2, <16 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
')
;; narrow a 32-element vector to its first 4 elements
define(`convert32to4', `
$3 = shufflevector <32 x $1> $2, <32 x $1> undef,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
')
;; narrow a 32-element vector to its first 8 elements
;; the shuffle mask must list exactly as many indices as its declared length
define(`convert32to8', `
$3 = shufflevector <32 x $1> $2, <32 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
')
;; narrow a 32-element vector to its first 16 elements
;; the shuffle mask must list exactly as many indices as its declared length
define(`convert32to16', `
$3 = shufflevector <32 x $1> $2, <32 x $1> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; saturation arithmetic
;; entry point for targets with SSE2 saturating instructions: selects the
;; implementation that matches the target vector width (4, 8 or 16).
;; Any other width prints an error message and stops processing with
;; exit status 1.
define(`saturation_arithmetic',
`ifelse(WIDTH, `4', `saturation_arithmetic_vec4()',
WIDTH, `8', `saturation_arithmetic_vec8()',
WIDTH, `16', `saturation_arithmetic_vec16() ',
`errprint(`ERROR: saturation_arithmetic() macro called with unsupported width = 'WIDTH
)
m4exit(`1')')
')
;; create vector constant. Used by saturation_arithmetic_novec_universal below.
;; $1: element type
;; $2: element value
;; expands to a vector literal holding one copy of the element per lane for
;; target widths 4, 8, 16, 32 and 64; any other width yields a literal with a
;; single element
define(`const_vector', `
ifelse(WIDTH, `4', `<$1 $2, $1 $2, $1 $2, $1 $2>',
WIDTH, `8', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
WIDTH, `16', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
WIDTH, `32', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
WIDTH, `64', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
`<$1 $2>')')
;; utility function used by saturation_arithmetic_novec below. This shouldn't be called by
;; target .ll files directly.
;; $1: {add,sub} (used in constructing function names)
;;
;; emits four functions for the given operation: signed and unsigned
;; saturating variants for i8 and for i16 elements. Each function extends both
;; operands to twice the element size (sign extension for the signed variants,
;; zero extension for the unsigned ones), applies the operation, clamps the
;; wide result to the range of the narrow type with two compare and select
;; steps and truncates back to the narrow type.
;; note on the unsigned variants: for sub, a result that wrapped below zero
;; first matches the unsigned upper-bound compare, but the following signed
;; less-than-zero select overrides that choice and yields 0. The order of the
;; two selects therefore matters.
define(`saturation_arithmetic_novec_universal', `
define <WIDTH x i8> @__p$1s_vi8(<WIDTH x i8>, <WIDTH x i8>) {
%v0_i16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
%v1_i16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
%res = $1 <WIDTH x i16> %v0_i16, %v1_i16
%over_mask = icmp sgt <WIDTH x i16> %res, const_vector(i16, 127)
%over_res = select <WIDTH x i1> %over_mask, <WIDTH x i16> const_vector(i16, 127), <WIDTH x i16> %res
%under_mask = icmp slt <WIDTH x i16> %res, const_vector(i16, -128)
%ret_i16 = select <WIDTH x i1> %under_mask, <WIDTH x i16> const_vector(i16, -128), <WIDTH x i16> %over_res
%ret = trunc <WIDTH x i16> %ret_i16 to <WIDTH x i8>
ret <WIDTH x i8> %ret
}
define <WIDTH x i16> @__p$1s_vi16(<WIDTH x i16>, <WIDTH x i16>) {
%v0_i32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
%v1_i32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
%res = $1 <WIDTH x i32> %v0_i32, %v1_i32
%over_mask = icmp sgt <WIDTH x i32> %res, const_vector(i32, 32767)
%over_res = select <WIDTH x i1> %over_mask, <WIDTH x i32> const_vector(i32, 32767), <WIDTH x i32> %res
%under_mask = icmp slt <WIDTH x i32> %res, const_vector(i32, -32768)
%ret_i32 = select <WIDTH x i1> %under_mask, <WIDTH x i32> const_vector(i32, -32768), <WIDTH x i32> %over_res
%ret = trunc <WIDTH x i32> %ret_i32 to <WIDTH x i16>
ret <WIDTH x i16> %ret
}
define <WIDTH x i8> @__p$1us_vi8(<WIDTH x i8>, <WIDTH x i8>) {
%v0_i16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
%v1_i16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
%res = $1 <WIDTH x i16> %v0_i16, %v1_i16
%over_mask = icmp ugt <WIDTH x i16> %res, const_vector(i16, 255)
%over_res = select <WIDTH x i1> %over_mask, <WIDTH x i16> const_vector(i16, 255), <WIDTH x i16> %res
%under_mask = icmp slt <WIDTH x i16> %res, const_vector(i16, 0)
%ret_i16 = select <WIDTH x i1> %under_mask, <WIDTH x i16> const_vector(i16, 0), <WIDTH x i16> %over_res
%ret = trunc <WIDTH x i16> %ret_i16 to <WIDTH x i8>
ret <WIDTH x i8> %ret
}
define <WIDTH x i16> @__p$1us_vi16(<WIDTH x i16>, <WIDTH x i16>) {
%v0_i32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
%v1_i32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
%res = $1 <WIDTH x i32> %v0_i32, %v1_i32
%over_mask = icmp ugt <WIDTH x i32> %res, const_vector(i32, 65535)
%over_res = select <WIDTH x i1> %over_mask, <WIDTH x i32> const_vector(i32, 65535), <WIDTH x i32> %res
%under_mask = icmp slt <WIDTH x i32> %res, const_vector(i32, 0)
%ret_i32 = select <WIDTH x i1> %under_mask, <WIDTH x i32> const_vector(i32, 0), <WIDTH x i32> %over_res
%ret = trunc <WIDTH x i32> %ret_i32 to <WIDTH x i16>
ret <WIDTH x i16> %ret
}
')
;; implementation for targets that do not have h/w saturating instructions:
;; emits the generic widen, clamp and truncate functions for both sub and add
define(`saturation_arithmetic_novec', `
saturation_arithmetic_novec_universal(sub)
saturation_arithmetic_novec_universal(add)
')
;;4-wide vector saturation arithmetic
;; each function pads its 4-lane operands to the operand width of the SSE2
;; intrinsic (16 x i8 or 8 x i16, extra lanes undef), calls the matching
;; saturating intrinsic and keeps the first 4 lanes of the result
define(`saturation_arithmetic_vec4', `
declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) {
convert4to16(i8, %0, %v0)
convert4to16(i8, %1, %v1)
%r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
convert16to4(i8, %r16, %r)
ret <4 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) {
convert4to8(i16, %0, %v0)
convert4to8(i16, %1, %v1)
%r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1)
convert8to4(i16, %r16, %r)
ret <4 x i16> %r
}
declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) {
convert4to16(i8, %0, %v0)
convert4to16(i8, %1, %v1)
%r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
convert16to4(i8, %r16, %r)
ret <4 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i16> @__paddus_vi16(<4 x i16>, <4 x i16>) {
convert4to8(i16, %0, %v0)
convert4to8(i16, %1, %v1)
%r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1)
convert8to4(i16, %r16, %r)
ret <4 x i16> %r
}
declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) {
convert4to16(i8, %0, %v0)
convert4to16(i8, %1, %v1)
%r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
convert16to4(i8, %r16, %r)
ret <4 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) {
convert4to8(i16, %0, %v0)
convert4to8(i16, %1, %v1)
%r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1)
convert8to4(i16, %r16, %r)
ret <4 x i16> %r
}
declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) {
convert4to16(i8, %0, %v0)
convert4to16(i8, %1, %v1)
%r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
convert16to4(i8, %r16, %r)
ret <4 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) {
convert4to8(i16, %0, %v0)
convert4to8(i16, %1, %v1)
%r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1)
convert8to4(i16, %r16, %r)
ret <4 x i16> %r
}
')
;;8-wide vector saturation arithmetic
;; the i8 variants pad their 8-lane operands to 16 lanes (extra lanes undef),
;; call the 16 x i8 SSE2 intrinsic and keep the first 8 result lanes;
;; the i16 variants map directly onto the 8 x i16 SSE2 intrinsics
define(`saturation_arithmetic_vec8', `
declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) {
convert8to16(i8, %0, %v0)
convert8to16(i8, %1, %v1)
%r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
convert16to8(i8, %r16, %r)
ret <8 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) {
%res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %res
}
declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) {
convert8to16(i8, %0, %v0)
convert8to16(i8, %1, %v1)
%r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
convert16to8(i8, %r16, %r)
ret <8 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) {
%res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %res
}
declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) {
convert8to16(i8, %0, %v0)
convert8to16(i8, %1, %v1)
%r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
convert16to8(i8, %r16, %r)
ret <8 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) {
%res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %res
}
declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) {
convert8to16(i8, %0, %v0)
convert8to16(i8, %1, %v1)
%r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
convert16to8(i8, %r16, %r)
ret <8 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) {
%res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %res
}
')
;;16-wide vector saturation arithmetic
;; the i8 variants map directly onto the 16 x i8 SSE2 intrinsics;
;; the i16 variants go through the 8-to-16 binary helper macro because the
;; SSE2 intrinsic only takes 8 x i16 operands. That helper is defined outside
;; this section - presumably it applies the 8-wide intrinsic to each half of
;; the operands; confirm against its definition.
define(`saturation_arithmetic_vec16', `
declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) {
%res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) {
binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
ret <16 x i16> %ret
}
declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) {
%res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) {
binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
ret <16 x i16> %ret
}
declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) {
%res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) {
binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
ret <16 x i16> %ret
}
declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) {
%res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) {
binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
ret <16 x i16> %ret
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; vector deconstruction utilities ;; vector deconstruction utilities
;; split 8-wide vector into 2 4-wide vectors ;; split 8-wide vector into 2 4-wide vectors
;; ;;

View File

@@ -57,6 +57,43 @@
#error Unknown value of ISPC_MASK_BITS #error Unknown value of ISPC_MASK_BITS
#endif #endif
/* Limits of integral types.
   Every macro is guarded by #ifndef, so a definition that already exists
   is left untouched. */
/* Largest values of the signed types. */
#ifndef INT8_MAX
#define INT8_MAX (127)
#endif
#ifndef INT16_MAX
#define INT16_MAX (32767)
#endif
#ifndef INT32_MAX
#define INT32_MAX (2147483647)
#endif
#ifndef INT64_MAX
#define INT64_MAX (9223372036854775807)
#endif
/* Largest values of the unsigned types. */
#ifndef UINT8_MAX
#define UINT8_MAX (255)
#endif
#ifndef UINT16_MAX
#define UINT16_MAX (65535)
#endif
#ifndef UINT32_MAX
#define UINT32_MAX (4294967295)
#endif
#ifndef UINT64_MAX
#define UINT64_MAX (18446744073709551615)
#endif
/* Smallest values of the signed types, each derived from the matching
   maximum as (-MAX - 1). */
#ifndef INT8_MIN
#define INT8_MIN (-INT8_MAX - 1)
#endif
#ifndef INT16_MIN
#define INT16_MIN (-INT16_MAX - 1)
#endif
#ifndef INT32_MIN
#define INT32_MIN (-INT32_MAX - 1)
#endif
#ifndef INT64_MIN
#define INT64_MIN (-INT64_MAX - 1)
#endif
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// Low level primitives // Low level primitives
@@ -4345,6 +4382,108 @@ static inline void fastmath() {
__fastmath(); __fastmath();
} }
///////////////////////////////////////////////////////////////////////////
// saturation arithmetic
// Saturating addition of two uniform int8 values: returns a + b clamped to
// the int8 range instead of wrapping around.
static inline uniform int8 saturating_add(uniform int8 a, uniform int8 b) {
// Work on unsigned copies so that the addition itself wraps.
uniform unsigned int8 a_unsig = a, b_unsig = b;
uniform unsigned int8 result = a_unsig + b_unsig;
// a_unsig is reused as the saturation value: 0x7F if a >= 0, 0x80 if a < 0.
// Its top bit still equals the sign bit of the original a.
a_unsig = (a_unsig >> 7) + INT8_MAX;
// The top bit of the tested value is clear only when a and b have the same
// sign and the wrapped sum has the opposite one, i.e. the add overflowed.
if ((uniform int8) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
result = a_unsig;
return result;
}
// Saturating addition of varying int8 values; forwards both operands to the
// __padds_vi8 builtin and returns its result.
static inline varying int8 saturating_add(varying int8 a, varying int8 b) {
return __padds_vi8(a, b);
}
// Saturating addition of two uniform int16 values: returns a + b clamped to
// the int16 range instead of wrapping around.
static inline uniform int16 saturating_add(uniform int16 a, uniform int16 b) {
// Work on unsigned copies so that the addition itself wraps.
uniform unsigned int16 a_unsig = a, b_unsig = b;
uniform unsigned int16 result = a_unsig + b_unsig;
// a_unsig is reused as the saturation value: 0x7FFF if a >= 0, 0x8000 if
// a < 0. Its top bit still equals the sign bit of the original a.
a_unsig = (a_unsig >> 15) + INT16_MAX;
// The top bit of the tested value is clear only when a and b have the same
// sign and the wrapped sum has the opposite one, i.e. the add overflowed.
if ((uniform int16) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
result = a_unsig;
return result;
}
// Saturating addition of varying int16 values; forwards both operands to the
// __padds_vi16 builtin and returns its result.
static inline varying int16 saturating_add(varying int16 a, varying int16 b) {
return __padds_vi16(a, b);
}
// Saturating addition of two uniform unsigned int8 values: a sum that does
// not fit is replaced by the all-ones pattern instead of wrapping around.
static inline uniform unsigned int8 saturating_add(uniform unsigned int8 a,
                                                   uniform unsigned int8 b) {
    uniform unsigned int8 sum = a + b;
    // The sum wrapped around exactly when it ended up below the first
    // operand; negating the comparison result gives the mask OR-ed in below.
    uniform int8 wrap_mask = -(uniform int8)(sum < a);
    sum |= wrap_mask;
    return sum;
}
// Saturating addition of varying unsigned int8 values; forwards both operands
// to the __paddus_vi8 builtin and returns its result.
static inline varying unsigned int8 saturating_add(varying unsigned int8 a,
varying unsigned int8 b) {
return __paddus_vi8(a, b);
}
// Saturating addition of two uniform unsigned int16 values: a sum that does
// not fit is replaced by the all-ones pattern instead of wrapping around.
static inline uniform unsigned int16 saturating_add(uniform unsigned int16 a,
                                                    uniform unsigned int16 b) {
    uniform unsigned int16 sum = a + b;
    // The sum wrapped around exactly when it ended up below the first
    // operand; negating the comparison result gives the mask OR-ed in below.
    uniform int16 wrap_mask = -(uniform int16)(sum < a);
    sum |= wrap_mask;
    return sum;
}
// Saturating addition of varying unsigned int16 values; forwards both
// operands to the __paddus_vi16 builtin and returns its result.
static inline varying unsigned int16 saturating_add(varying unsigned int16 a,
varying unsigned int16 b) {
return __paddus_vi16(a, b);
}
// Saturating subtraction of two uniform int8 values: returns a - b clamped to
// the int8 range instead of wrapping around.
static inline uniform int8 saturating_sub(uniform int8 a, uniform int8 b) {
// Work on unsigned copies so that the subtraction itself wraps.
uniform unsigned int8 a_unsig = a, b_unsig = b;
uniform unsigned int8 result = a_unsig - b_unsig;
// a_unsig is reused as the saturation value: 0x7F if a >= 0, 0x80 if a < 0.
// Its top bit still equals the sign bit of the original a.
a_unsig = (a_unsig >> 7) + INT8_MAX;
// The top bit of the tested value is set only when a and b differ in sign
// and the wrapped difference differs in sign from a, i.e. on overflow.
if ((uniform int8) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
result = a_unsig;
return result;
}
// Saturating subtraction of varying int8 values; forwards both operands to
// the __psubs_vi8 builtin and returns its result.
static inline varying int8 saturating_sub(varying int8 a, varying int8 b) {
return __psubs_vi8(a, b);
}
// Saturating subtraction of two uniform int16 values: returns a - b clamped
// to the int16 range instead of wrapping around.
static inline uniform int16 saturating_sub(uniform int16 a, uniform int16 b) {
// Work on unsigned copies so that the subtraction itself wraps.
uniform unsigned int16 a_unsig = a, b_unsig = b;
uniform unsigned int16 result = a_unsig - b_unsig;
// a_unsig is reused as the saturation value: 0x7FFF if a >= 0, 0x8000 if
// a < 0. Its top bit still equals the sign bit of the original a.
a_unsig = (a_unsig >> 15) + INT16_MAX;
// The top bit of the tested value is set only when a and b differ in sign
// and the wrapped difference differs in sign from a, i.e. on overflow.
if ((uniform int16) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
result = a_unsig;
return result;
}
// Saturating subtraction of varying int16 values; forwards both operands to
// the __psubs_vi16 builtin and returns its result.
static inline varying int16 saturating_sub(varying int16 a, varying int16 b) {
return __psubs_vi16(a, b);
}
// Saturating subtraction of two uniform unsigned int8 values: a difference
// that would go below zero is replaced by 0 instead of wrapping around.
static inline uniform unsigned int8 saturating_sub(uniform unsigned int8 a,
                                                   uniform unsigned int8 b) {
    uniform unsigned int8 diff = a - b;
    // The difference is valid exactly when it did not end up above the first
    // operand; negating the comparison result gives the mask AND-ed in below,
    // which clears a wrapped difference.
    uniform int8 keep_mask = -(uniform int8)(diff <= a);
    diff &= keep_mask;
    return diff;
}
// Saturating subtraction of varying unsigned int8 values; forwards both
// operands to the __psubus_vi8 builtin and returns its result.
static inline varying unsigned int8 saturating_sub(varying unsigned int8 a,
varying unsigned int8 b) {
return __psubus_vi8(a, b);
}
// Saturating subtraction of two uniform unsigned int16 values: a difference
// that would go below zero is replaced by 0 instead of wrapping around.
static inline uniform unsigned int16 saturating_sub(uniform unsigned int16 a,
                                                    uniform unsigned int16 b) {
    uniform unsigned int16 diff = a - b;
    // The difference is valid exactly when it did not end up above the first
    // operand; negating the comparison result gives the mask AND-ed in below,
    // which clears a wrapped difference.
    uniform int16 keep_mask = -(uniform int16)(diff <= a);
    diff &= keep_mask;
    return diff;
}
// Saturating subtraction of varying unsigned int16 values; forwards both
// operands to the __psubus_vi16 builtin and returns its result.
static inline varying unsigned int16 saturating_sub(varying unsigned int16 a,
varying unsigned int16 b) {
return __psubus_vi16(a, b);
}
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// rdrand // rdrand

27
tests/padds_i16.ispc Normal file
View File

@@ -0,0 +1,27 @@
// Saturating addition on uniform int16 operands. Lanes cycle through three
// cases; the expected values below correspond to b == 5 (presumably the value
// the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int16 a_max = 32767;   // largest signed int16
    uniform int16 a_min = -32768;  // smallest signed int16
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        // upper limit plus b: expected to clamp at the upper limit
        RET[programIndex] = saturating_add(a_max, b);
    }
    else if (lane_case == 1) {
        // lower limit plus -b: expected to clamp at the lower limit
        RET[programIndex] = saturating_add(a_min, -b);
    }
    else {
        // lower limit plus b: stays in range
        RET[programIndex] = saturating_add(a_min, b);
    }
}

// Expected per-lane output, one value per lane case.
export void result(uniform float RET[]) {
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        RET[programIndex] = (uniform int16) 32767;
    }
    else if (lane_case == 1) {
        RET[programIndex] = (uniform int16) -32768;
    }
    else {
        RET[programIndex] = (uniform int16) -32763;
    }
}

27
tests/padds_i8.ispc Normal file
View File

@@ -0,0 +1,27 @@
// Saturating addition on uniform int8 operands. Lanes cycle through three
// cases; the expected values below correspond to b == 5 (presumably the value
// the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int8 a_max = 127;   // largest signed int8
    uniform int8 a_min = -128;  // smallest signed int8
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        // upper limit plus b: expected to clamp at the upper limit
        RET[programIndex] = saturating_add(a_max, b);
    }
    else if (lane_case == 1) {
        // lower limit plus -b: expected to clamp at the lower limit
        RET[programIndex] = saturating_add(a_min, -b);
    }
    else {
        // lower limit plus b: stays in range
        RET[programIndex] = saturating_add(a_min, b);
    }
}

// Expected per-lane output, one value per lane case.
export void result(uniform float RET[]) {
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        RET[programIndex] = (uniform int8) 127;
    }
    else if (lane_case == 1) {
        RET[programIndex] = (uniform int8) -128;
    }
    else {
        RET[programIndex] = (uniform int8) -123;
    }
}

27
tests/padds_vi16.ispc Normal file
View File

@@ -0,0 +1,27 @@
// Saturating addition on varying int16 operands. Lanes cycle through three
// cases; the expected values below correspond to b == 5 (presumably the value
// the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying int16 a_max = 32767;   // largest signed int16
    varying int16 a_min = -32768;  // smallest signed int16
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        // upper limit plus b: expected to clamp at the upper limit
        RET[programIndex] = saturating_add(a_max, b);
    }
    else if (lane_case == 1) {
        // lower limit plus -b: expected to clamp at the lower limit
        RET[programIndex] = saturating_add(a_min, -b);
    }
    else {
        // lower limit plus b: stays in range
        RET[programIndex] = saturating_add(a_min, b);
    }
}

// Expected per-lane output, one value per lane case.
export void result(uniform float RET[]) {
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        RET[programIndex] = (varying int16) 32767;
    }
    else if (lane_case == 1) {
        RET[programIndex] = (varying int16) -32768;
    }
    else {
        RET[programIndex] = (varying int16) -32763;
    }
}

27
tests/padds_vi8.ispc Normal file
View File

@@ -0,0 +1,27 @@
// Saturating addition on varying int8 operands. Lanes cycle through three
// cases; the expected values below correspond to b == 5 (presumably the value
// the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying int8 a_max = 127;   // largest signed int8
    varying int8 a_min = -128;  // smallest signed int8
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        // upper limit plus b: expected to clamp at the upper limit
        RET[programIndex] = saturating_add(a_max, b);
    }
    else if (lane_case == 1) {
        // lower limit plus -b: expected to clamp at the lower limit
        RET[programIndex] = saturating_add(a_min, -b);
    }
    else {
        // lower limit plus b: stays in range
        RET[programIndex] = saturating_add(a_min, b);
    }
}

// Expected per-lane output, one value per lane case.
export void result(uniform float RET[]) {
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        RET[programIndex] = (varying int8) 127;
    }
    else if (lane_case == 1) {
        RET[programIndex] = (varying int8) -128;
    }
    else {
        RET[programIndex] = (varying int8) -123;
    }
}

21
tests/paddus_i16.ispc Normal file
View File

@@ -0,0 +1,21 @@
// Saturating addition on uniform unsigned int16 operands. Even and odd lanes
// exercise different cases; the expected values below correspond to b == 5
// (presumably the value the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform unsigned int16 a_max = 65535;  // largest unsigned int16
    uniform unsigned int16 a_min = 0;      // smallest unsigned int16
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        // upper limit plus b: expected to clamp at the upper limit
        RET[programIndex] = saturating_add(a_max, b);
    }
    else {
        // zero plus b: stays in range
        RET[programIndex] = saturating_add(a_min, b);
    }
}

// Expected per-lane output, one value per lane parity.
export void result(uniform float RET[]) {
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        RET[programIndex] = (uniform unsigned int16) 65535;
    }
    else {
        RET[programIndex] = (uniform unsigned int16) 5;
    }
}

21
tests/paddus_i8.ispc Normal file
View File

@@ -0,0 +1,21 @@
// Saturating addition on uniform unsigned int8 operands. Even and odd lanes
// exercise different cases; the expected values below correspond to b == 5
// (presumably the value the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform unsigned int8 a_max = 255;  // largest unsigned int8
    uniform unsigned int8 a_min = 0;    // smallest unsigned int8
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        // upper limit plus b: expected to clamp at the upper limit
        RET[programIndex] = saturating_add(a_max, b);
    }
    else {
        // zero plus b: stays in range
        RET[programIndex] = saturating_add(a_min, b);
    }
}

// Expected per-lane output, one value per lane parity.
export void result(uniform float RET[]) {
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        RET[programIndex] = (uniform unsigned int8) 255;
    }
    else {
        RET[programIndex] = (uniform unsigned int8) 5;
    }
}

21
tests/paddus_vi16.ispc Normal file
View File

@@ -0,0 +1,21 @@
// Saturating addition on varying unsigned int16 operands. Even and odd lanes
// exercise different cases; the expected values below correspond to b == 5
// (presumably the value the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying unsigned int16 a_max = 65535;  // largest unsigned int16
    varying unsigned int16 a_min = 0;      // smallest unsigned int16
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        // upper limit plus b: expected to clamp at the upper limit
        RET[programIndex] = saturating_add(a_max, b);
    }
    else {
        // zero plus b: stays in range
        RET[programIndex] = saturating_add(a_min, b);
    }
}

// Expected per-lane output, one value per lane parity.
export void result(uniform float RET[]) {
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        RET[programIndex] = (varying unsigned int16) 65535;
    }
    else {
        RET[programIndex] = (varying unsigned int16) 5;
    }
}

22
tests/paddus_vi8.ispc Normal file
View File

@@ -0,0 +1,22 @@
// Saturating addition on varying unsigned int8 operands. Even and odd lanes
// exercise different cases; the expected values below correspond to b == 5
// (presumably the value the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying unsigned int8 a_max = 255;  // largest unsigned int8
    varying unsigned int8 a_min = 0;    // smallest unsigned int8
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        // upper limit plus b: expected to clamp at the upper limit
        RET[programIndex] = saturating_add(a_max, b);
    }
    else {
        // zero plus b: stays in range
        RET[programIndex] = saturating_add(a_min, b);
    }
}

// Expected per-lane output, one value per lane parity.
export void result(uniform float RET[]) {
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        RET[programIndex] = (varying unsigned int8) 255;
    }
    else {
        RET[programIndex] = (varying unsigned int8) 5;
    }
}

27
tests/psubs_i16.ispc Normal file
View File

@@ -0,0 +1,27 @@
// Saturating subtraction on uniform int16 operands. Lanes cycle through three
// cases; the expected values below correspond to b == 5 (presumably the value
// the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int16 a_max = 32767;   // largest signed int16
    uniform int16 a_min = -32768;  // smallest signed int16
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        // lower limit minus b: expected to clamp at the lower limit
        RET[programIndex] = saturating_sub(a_min, b);
    }
    else if (lane_case == 1) {
        // upper limit minus -b: expected to clamp at the upper limit
        RET[programIndex] = saturating_sub(a_max, -b);
    }
    else {
        // upper limit minus b: stays in range
        RET[programIndex] = saturating_sub(a_max, b);
    }
}

// Expected per-lane output, one value per lane case.
export void result(uniform float RET[]) {
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        RET[programIndex] = (uniform int16) -32768;
    }
    else if (lane_case == 1) {
        RET[programIndex] = (uniform int16) 32767;
    }
    else {
        RET[programIndex] = (uniform int16) 32762;
    }
}

27
tests/psubs_i8.ispc Normal file
View File

@@ -0,0 +1,27 @@
// Saturating subtraction on uniform int8 operands. Lanes cycle through three
// cases; the expected values below correspond to b == 5 (presumably the value
// the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int8 a_max = 127;   // largest signed int8
    uniform int8 a_min = -128;  // smallest signed int8
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        // lower limit minus b: expected to clamp at the lower limit
        RET[programIndex] = saturating_sub(a_min, b);
    }
    else if (lane_case == 1) {
        // upper limit minus -b: expected to clamp at the upper limit
        RET[programIndex] = saturating_sub(a_max, -b);
    }
    else {
        // upper limit minus b: stays in range
        RET[programIndex] = saturating_sub(a_max, b);
    }
}

// Expected per-lane output, one value per lane case.
export void result(uniform float RET[]) {
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        RET[programIndex] = (uniform int8) -128;
    }
    else if (lane_case == 1) {
        RET[programIndex] = (uniform int8) 127;
    }
    else {
        RET[programIndex] = (uniform int8) 122;
    }
}

27
tests/psubs_vi16.ispc Normal file
View File

@@ -0,0 +1,27 @@
// Saturating subtraction on varying int16 operands. Lanes cycle through three
// cases; the expected values below correspond to b == 5 (presumably the value
// the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying int16 a_max = 32767;   // largest signed int16
    varying int16 a_min = -32768;  // smallest signed int16
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        // lower limit minus b: expected to clamp at the lower limit
        RET[programIndex] = saturating_sub(a_min, b);
    }
    else if (lane_case == 1) {
        // upper limit minus -b: expected to clamp at the upper limit
        RET[programIndex] = saturating_sub(a_max, -b);
    }
    else {
        // upper limit minus b: stays in range
        RET[programIndex] = saturating_sub(a_max, b);
    }
}

// Expected per-lane output, one value per lane case.
export void result(uniform float RET[]) {
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        RET[programIndex] = (varying int16) -32768;
    }
    else if (lane_case == 1) {
        RET[programIndex] = (varying int16) 32767;
    }
    else {
        RET[programIndex] = (varying int16) 32762;
    }
}

27
tests/psubs_vi8.ispc Normal file
View File

@@ -0,0 +1,27 @@
// Saturating subtraction on varying int8 operands. Lanes cycle through three
// cases; the expected values below correspond to b == 5 (presumably the value
// the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying int8 a_max = 127;   // largest signed int8
    varying int8 a_min = -128;  // smallest signed int8
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        // lower limit minus b: expected to clamp at the lower limit
        RET[programIndex] = saturating_sub(a_min, b);
    }
    else if (lane_case == 1) {
        // upper limit minus -b: expected to clamp at the upper limit
        RET[programIndex] = saturating_sub(a_max, -b);
    }
    else {
        // upper limit minus b: stays in range
        RET[programIndex] = saturating_sub(a_max, b);
    }
}

// Expected per-lane output, one value per lane case.
export void result(uniform float RET[]) {
    int lane_case = programIndex % 3;
    if (lane_case == 0) {
        RET[programIndex] = (varying int8) -128;
    }
    else if (lane_case == 1) {
        RET[programIndex] = (varying int8) 127;
    }
    else {
        RET[programIndex] = (varying int8) 122;
    }
}

21
tests/psubus_i16.ispc Normal file
View File

@@ -0,0 +1,21 @@
// Saturating subtraction on uniform unsigned int16 operands. Even and odd
// lanes exercise different cases; the expected values below correspond to
// b == 5 (presumably the value the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform unsigned int16 a_max = 65535;  // largest unsigned int16
    uniform unsigned int16 a_min = 0;      // smallest unsigned int16
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        // zero minus b: expected to clamp at zero
        RET[programIndex] = saturating_sub(a_min, b);
    }
    else {
        // upper limit minus b: stays in range
        RET[programIndex] = saturating_sub(a_max, b);
    }
}

// Expected per-lane output, one value per lane parity.
export void result(uniform float RET[]) {
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        RET[programIndex] = (uniform unsigned int16) 0;
    }
    else {
        RET[programIndex] = (uniform unsigned int16) 65530;
    }
}

21
tests/psubus_i8.ispc Normal file
View File

@@ -0,0 +1,21 @@
// Saturating subtraction on uniform unsigned int8 operands. Even and odd
// lanes exercise different cases; the expected values below correspond to
// b == 5 (presumably the value the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform unsigned int8 a_max = 255;  // largest unsigned int8
    uniform unsigned int8 a_min = 0;    // smallest unsigned int8
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        // zero minus b: expected to clamp at zero
        RET[programIndex] = saturating_sub(a_min, b);
    }
    else {
        // upper limit minus b: stays in range
        RET[programIndex] = saturating_sub(a_max, b);
    }
}

// Expected per-lane output, one value per lane parity.
export void result(uniform float RET[]) {
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        RET[programIndex] = (uniform unsigned int8) 0;
    }
    else {
        RET[programIndex] = (uniform unsigned int8) 250;
    }
}

21
tests/psubus_vi16.ispc Normal file
View File

@@ -0,0 +1,21 @@
// Saturating subtraction on varying unsigned int16 operands. Even and odd
// lanes exercise different cases; the expected values below correspond to
// b == 5 (presumably the value the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying unsigned int16 a_max = 65535;  // largest unsigned int16
    varying unsigned int16 a_min = 0;      // smallest unsigned int16
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        // zero minus b: expected to clamp at zero
        RET[programIndex] = saturating_sub(a_min, b);
    }
    else {
        // upper limit minus b: stays in range
        RET[programIndex] = saturating_sub(a_max, b);
    }
}

// Expected per-lane output, one value per lane parity.
export void result(uniform float RET[]) {
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        RET[programIndex] = (varying unsigned int16) 0;
    }
    else {
        RET[programIndex] = (varying unsigned int16) 65530;
    }
}

21
tests/psubus_vi8.ispc Normal file
View File

@@ -0,0 +1,21 @@
// Saturating subtraction on varying unsigned int8 operands. Even and odd
// lanes exercise different cases; the expected values below correspond to
// b == 5 (presumably the value the test harness passes - confirm there).
export uniform int width() { return programCount; }

export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    varying unsigned int8 a_max = 255;  // largest unsigned int8
    varying unsigned int8 a_min = 0;    // smallest unsigned int8
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        // zero minus b: expected to clamp at zero
        RET[programIndex] = saturating_sub(a_min, b);
    }
    else {
        // upper limit minus b: stays in range
        RET[programIndex] = saturating_sub(a_max, b);
    }
}

// Expected per-lane output, one value per lane parity.
export void result(uniform float RET[]) {
    int lane_parity = programIndex % 2;
    if (lane_parity == 0) {
        RET[programIndex] = (varying unsigned int8) 0;
    }
    else {
        RET[programIndex] = (varying unsigned int8) 250;
    }
}