The scalar (uniform) implementations of the no-vec saturation-arithmetic functions were moved from the builtins to stdlib.ispc.
This commit is contained in:
@@ -488,12 +488,8 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__num_cores",
|
"__num_cores",
|
||||||
"__packed_load_active",
|
"__packed_load_active",
|
||||||
"__packed_store_active",
|
"__packed_store_active",
|
||||||
"__padds_i8",
|
|
||||||
"__padds_i16",
|
|
||||||
"__padds_vi8",
|
"__padds_vi8",
|
||||||
"__padds_vi16",
|
"__padds_vi16",
|
||||||
"__paddus_i8",
|
|
||||||
"__paddus_i16",
|
|
||||||
"__paddus_vi8",
|
"__paddus_vi8",
|
||||||
"__paddus_vi16",
|
"__paddus_vi16",
|
||||||
"__popcnt_int32",
|
"__popcnt_int32",
|
||||||
@@ -502,12 +498,8 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__prefetch_read_uniform_2",
|
"__prefetch_read_uniform_2",
|
||||||
"__prefetch_read_uniform_3",
|
"__prefetch_read_uniform_3",
|
||||||
"__prefetch_read_uniform_nt",
|
"__prefetch_read_uniform_nt",
|
||||||
"__psubs_i8",
|
|
||||||
"__psubs_i16",
|
|
||||||
"__psubs_vi8",
|
"__psubs_vi8",
|
||||||
"__psubs_vi16",
|
"__psubs_vi16",
|
||||||
"__psubus_i8",
|
|
||||||
"__psubus_i16",
|
|
||||||
"__psubus_vi8",
|
"__psubus_vi8",
|
||||||
"__psubus_vi16",
|
"__psubus_vi16",
|
||||||
"__rcp_uniform_float",
|
"__rcp_uniform_float",
|
||||||
|
|||||||
@@ -40,7 +40,6 @@ ctlztz()
|
|||||||
define_prefetches()
|
define_prefetches()
|
||||||
define_shuffles()
|
define_shuffles()
|
||||||
aossoa()
|
aossoa()
|
||||||
saturation_arithmetic_uniform()
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rounding floats
|
;; rounding floats
|
||||||
|
|||||||
@@ -41,7 +41,6 @@ stdlib_core()
|
|||||||
scans()
|
scans()
|
||||||
reduce_equal(WIDTH)
|
reduce_equal(WIDTH)
|
||||||
rdrand_decls()
|
rdrand_decls()
|
||||||
saturation_arithmetic_uniform()
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; broadcast/rotate/shuffle
|
;; broadcast/rotate/shuffle
|
||||||
|
|||||||
@@ -34,7 +34,6 @@ define_prefetches()
|
|||||||
define_shuffles()
|
define_shuffles()
|
||||||
aossoa()
|
aossoa()
|
||||||
rdrand_decls()
|
rdrand_decls()
|
||||||
saturation_arithmetic_uniform()
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rcp
|
;; rcp
|
||||||
|
|||||||
@@ -37,7 +37,6 @@ define_prefetches()
|
|||||||
define_shuffles()
|
define_shuffles()
|
||||||
aossoa()
|
aossoa()
|
||||||
rdrand_decls()
|
rdrand_decls()
|
||||||
saturation_arithmetic_uniform()
|
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rounding floats
|
;; rounding floats
|
||||||
|
|||||||
@@ -184,73 +184,7 @@ define(`convert32to16', `
|
|||||||
define(`saturation_arithmetic',
|
define(`saturation_arithmetic',
|
||||||
`ifelse(WIDTH, `4', `saturation_arithmetic_vec4()',
|
`ifelse(WIDTH, `4', `saturation_arithmetic_vec4()',
|
||||||
WIDTH, `8', `saturation_arithmetic_vec8()',
|
WIDTH, `8', `saturation_arithmetic_vec8()',
|
||||||
WIDTH, `16', `saturation_arithmetic_vec16()',
|
WIDTH, `16', `saturation_arithmetic_vec16()')')
|
||||||
`saturation_arithmetic_uniform()')')
|
|
||||||
|
|
||||||
;; utility function used by saturation_arithmetic_uniform below. This shouldn't be called by
|
|
||||||
;; target .ll files directly.
|
|
||||||
;; $1: {add,sub} (used in constructing function names)
|
|
||||||
|
|
||||||
define(`saturation_arithmetic_uniform_universal', `
|
|
||||||
declare <16 x i8> @llvm.x86.sse2.p$1s.b(<16 x i8>, <16 x i8>) nounwind readnone
|
|
||||||
define i8 @__p$1s_i8(i8 %a0, i8 %a1) {
|
|
||||||
%a0_i16 = sext i8 %a0 to i16
|
|
||||||
%a1_i16 = sext i8 %a1 to i16
|
|
||||||
%res = $1 i16 %a0_i16, %a1_i16
|
|
||||||
%over_mask = icmp sgt i16 %res, 127
|
|
||||||
%over_res = select i1 %over_mask, i16 127, i16 %res
|
|
||||||
%under_mask = icmp slt i16 %res, -128
|
|
||||||
%ret_i16 = select i1 %under_mask, i16 -128, i16 %over_res
|
|
||||||
%ret = trunc i16 %ret_i16 to i8
|
|
||||||
ret i8 %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
declare <8 x i16> @llvm.x86.sse2.p$1s.w(<8 x i16>, <8 x i16>) nounwind readnone
|
|
||||||
define i16 @__p$1s_i16(i16 %a0, i16 %a1) {
|
|
||||||
%a0_i32 = sext i16 %a0 to i32
|
|
||||||
%a1_i32 = sext i16 %a1 to i32
|
|
||||||
%res = $1 i32 %a0_i32, %a1_i32
|
|
||||||
%over_mask = icmp sgt i32 %res, 32767
|
|
||||||
%over_res = select i1 %over_mask, i32 32767, i32 %res
|
|
||||||
%under_mask = icmp slt i32 %res, -32768
|
|
||||||
%ret_i32 = select i1 %under_mask, i32 -32768, i32 %over_res
|
|
||||||
%ret = trunc i32 %ret_i32 to i16
|
|
||||||
ret i16 %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
declare <16 x i8> @llvm.x86.sse2.p$1us.b(<16 x i8>, <16 x i8>) nounwind readnone
|
|
||||||
define i8 @__p$1us_i8(i8 %a0, i8 %a1) {
|
|
||||||
%a0_i16 = zext i8 %a0 to i16
|
|
||||||
%a1_i16 = zext i8 %a1 to i16
|
|
||||||
%res = $1 i16 %a0_i16, %a1_i16
|
|
||||||
%over_mask = icmp ugt i16 %res, 255
|
|
||||||
%over_res = select i1 %over_mask, i16 255, i16 %res
|
|
||||||
%under_mask = icmp slt i16 %res, 0
|
|
||||||
%ret_i16 = select i1 %under_mask, i16 0, i16 %over_res
|
|
||||||
%ret = trunc i16 %ret_i16 to i8
|
|
||||||
ret i8 %ret
|
|
||||||
}
|
|
||||||
|
|
||||||
declare <8 x i16> @llvm.x86.sse2.p$1us.w(<8 x i16>, <8 x i16>) nounwind readnone
|
|
||||||
define i16 @__p$1us_i16(i16 %a0, i16 %a1) {
|
|
||||||
%a0_i32 = zext i16 %a0 to i32
|
|
||||||
%a1_i32 = zext i16 %a1 to i32
|
|
||||||
%res = $1 i32 %a0_i32, %a1_i32
|
|
||||||
%over_mask = icmp ugt i32 %res, 65535
|
|
||||||
%over_res = select i1 %over_mask, i32 65535, i32 %res
|
|
||||||
%under_mask = icmp slt i32 %res, 0
|
|
||||||
%ret_i32 = select i1 %under_mask, i32 0, i32 %over_res
|
|
||||||
%ret = trunc i32 %ret_i32 to i16
|
|
||||||
ret i16 %ret
|
|
||||||
}
|
|
||||||
')
|
|
||||||
|
|
||||||
;;uniform saturation arithmetic
|
|
||||||
|
|
||||||
define(`saturation_arithmetic_uniform', `
|
|
||||||
saturation_arithmetic_uniform_universal(sub)
|
|
||||||
saturation_arithmetic_uniform_universal(add)
|
|
||||||
')
|
|
||||||
|
|
||||||
;; create vector constant. Used by saturation_arithmetic_novec_universal below.
|
;; create vector constant. Used by saturation_arithmetic_novec_universal below.
|
||||||
|
|
||||||
@@ -278,6 +212,7 @@ ifelse(WIDTH, `4', `<$1 $2, $1 $2, $1 $2, $1 $2>',
|
|||||||
;; $1: {add,sub} (used in constructing function names)
|
;; $1: {add,sub} (used in constructing function names)
|
||||||
|
|
||||||
define(`saturation_arithmetic_novec_universal', `
|
define(`saturation_arithmetic_novec_universal', `
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.p$1s.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <WIDTH x i8> @__p$1s_vi8(<WIDTH x i8>, <WIDTH x i8>) {
|
define <WIDTH x i8> @__p$1s_vi8(<WIDTH x i8>, <WIDTH x i8>) {
|
||||||
%v0_i16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
|
%v0_i16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
|
||||||
%v1_i16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
|
%v1_i16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
|
||||||
@@ -290,6 +225,7 @@ define <WIDTH x i8> @__p$1s_vi8(<WIDTH x i8>, <WIDTH x i8>) {
|
|||||||
ret <WIDTH x i8> %ret
|
ret <WIDTH x i8> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.p$1s.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <WIDTH x i16> @__p$1s_vi16(<WIDTH x i16>, <WIDTH x i16>) {
|
define <WIDTH x i16> @__p$1s_vi16(<WIDTH x i16>, <WIDTH x i16>) {
|
||||||
%v0_i32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
|
%v0_i32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
|
||||||
%v1_i32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
|
%v1_i32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
|
||||||
@@ -302,6 +238,7 @@ define <WIDTH x i16> @__p$1s_vi16(<WIDTH x i16>, <WIDTH x i16>) {
|
|||||||
ret <WIDTH x i16> %ret
|
ret <WIDTH x i16> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.p$1us.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <WIDTH x i8> @__p$1us_vi8(<WIDTH x i8>, <WIDTH x i8>) {
|
define <WIDTH x i8> @__p$1us_vi8(<WIDTH x i8>, <WIDTH x i8>) {
|
||||||
%v0_i16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
|
%v0_i16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
|
||||||
%v1_i16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
|
%v1_i16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
|
||||||
@@ -314,6 +251,7 @@ define <WIDTH x i8> @__p$1us_vi8(<WIDTH x i8>, <WIDTH x i8>) {
|
|||||||
ret <WIDTH x i8> %ret
|
ret <WIDTH x i8> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.p$1us.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <WIDTH x i16> @__p$1us_vi16(<WIDTH x i16>, <WIDTH x i16>) {
|
define <WIDTH x i16> @__p$1us_vi16(<WIDTH x i16>, <WIDTH x i16>) {
|
||||||
%v0_i32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
|
%v0_i32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
|
||||||
%v1_i32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
|
%v1_i32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
|
||||||
@@ -337,6 +275,7 @@ saturation_arithmetic_novec_universal(add)
|
|||||||
;;4-wide vector saturation arithmetic
|
;;4-wide vector saturation arithmetic
|
||||||
|
|
||||||
define(`saturation_arithmetic_vec4', `
|
define(`saturation_arithmetic_vec4', `
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) {
|
define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) {
|
||||||
convert4to16(i8, %0, %v0)
|
convert4to16(i8, %0, %v0)
|
||||||
convert4to16(i8, %1, %v1)
|
convert4to16(i8, %1, %v1)
|
||||||
@@ -345,6 +284,7 @@ define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) {
|
|||||||
ret <4 x i8> %r
|
ret <4 x i8> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) {
|
define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) {
|
||||||
convert4to8(i16, %0, %v0)
|
convert4to8(i16, %0, %v0)
|
||||||
convert4to8(i16, %1, %v1)
|
convert4to8(i16, %1, %v1)
|
||||||
@@ -353,6 +293,7 @@ define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) {
|
|||||||
ret <4 x i16> %r
|
ret <4 x i16> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) {
|
define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) {
|
||||||
convert4to16(i8, %0, %v0)
|
convert4to16(i8, %0, %v0)
|
||||||
convert4to16(i8, %1, %v1)
|
convert4to16(i8, %1, %v1)
|
||||||
@@ -361,6 +302,7 @@ define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) {
|
|||||||
ret <4 x i8> %r
|
ret <4 x i8> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <4 x i16> @__paddus_vi16(<4 x i16>, <4 x i16>) {
|
define <4 x i16> @__paddus_vi16(<4 x i16>, <4 x i16>) {
|
||||||
convert4to8(i16, %0, %v0)
|
convert4to8(i16, %0, %v0)
|
||||||
convert4to8(i16, %1, %v1)
|
convert4to8(i16, %1, %v1)
|
||||||
@@ -369,6 +311,7 @@ define <4 x i16> @__paddus_vi16(<4 x i16>, <4 x i16>) {
|
|||||||
ret <4 x i16> %r
|
ret <4 x i16> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) {
|
define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) {
|
||||||
convert4to16(i8, %0, %v0)
|
convert4to16(i8, %0, %v0)
|
||||||
convert4to16(i8, %1, %v1)
|
convert4to16(i8, %1, %v1)
|
||||||
@@ -377,6 +320,7 @@ define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) {
|
|||||||
ret <4 x i8> %r
|
ret <4 x i8> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) {
|
define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) {
|
||||||
convert4to8(i16, %0, %v0)
|
convert4to8(i16, %0, %v0)
|
||||||
convert4to8(i16, %1, %v1)
|
convert4to8(i16, %1, %v1)
|
||||||
@@ -385,6 +329,7 @@ define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) {
|
|||||||
ret <4 x i16> %r
|
ret <4 x i16> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) {
|
define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) {
|
||||||
convert4to16(i8, %0, %v0)
|
convert4to16(i8, %0, %v0)
|
||||||
convert4to16(i8, %1, %v1)
|
convert4to16(i8, %1, %v1)
|
||||||
@@ -393,6 +338,7 @@ define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) {
|
|||||||
ret <4 x i8> %r
|
ret <4 x i8> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) {
|
define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) {
|
||||||
convert4to8(i16, %0, %v0)
|
convert4to8(i16, %0, %v0)
|
||||||
convert4to8(i16, %1, %v1)
|
convert4to8(i16, %1, %v1)
|
||||||
@@ -405,6 +351,7 @@ define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) {
|
|||||||
;;8-wide vector saturation arithmetic
|
;;8-wide vector saturation arithmetic
|
||||||
|
|
||||||
define(`saturation_arithmetic_vec8', `
|
define(`saturation_arithmetic_vec8', `
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) {
|
define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) {
|
||||||
convert8to16(i8, %0, %v0)
|
convert8to16(i8, %0, %v0)
|
||||||
convert8to16(i8, %1, %v1)
|
convert8to16(i8, %1, %v1)
|
||||||
@@ -413,11 +360,13 @@ define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) {
|
|||||||
ret <8 x i8> %r
|
ret <8 x i8> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) {
|
define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) {
|
||||||
%res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1)
|
%res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1)
|
||||||
ret <8 x i16> %res
|
ret <8 x i16> %res
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) {
|
define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) {
|
||||||
convert8to16(i8, %0, %v0)
|
convert8to16(i8, %0, %v0)
|
||||||
convert8to16(i8, %1, %v1)
|
convert8to16(i8, %1, %v1)
|
||||||
@@ -426,11 +375,13 @@ define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) {
|
|||||||
ret <8 x i8> %r
|
ret <8 x i8> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) {
|
define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) {
|
||||||
%res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1)
|
%res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1)
|
||||||
ret <8 x i16> %res
|
ret <8 x i16> %res
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) {
|
define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) {
|
||||||
convert8to16(i8, %0, %v0)
|
convert8to16(i8, %0, %v0)
|
||||||
convert8to16(i8, %1, %v1)
|
convert8to16(i8, %1, %v1)
|
||||||
@@ -439,11 +390,13 @@ define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) {
|
|||||||
ret <8 x i8> %r
|
ret <8 x i8> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) {
|
define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) {
|
||||||
%res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
|
%res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
|
||||||
ret <8 x i16> %res
|
ret <8 x i16> %res
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) {
|
define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) {
|
||||||
convert8to16(i8, %0, %v0)
|
convert8to16(i8, %0, %v0)
|
||||||
convert8to16(i8, %1, %v1)
|
convert8to16(i8, %1, %v1)
|
||||||
@@ -452,6 +405,7 @@ define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) {
|
|||||||
ret <8 x i8> %r
|
ret <8 x i8> %r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) {
|
define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) {
|
||||||
%res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
|
%res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
|
||||||
ret <8 x i16> %res
|
ret <8 x i16> %res
|
||||||
@@ -461,41 +415,49 @@ define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) {
|
|||||||
;;16-wide vector saturation arithmetic
|
;;16-wide vector saturation arithmetic
|
||||||
|
|
||||||
define(`saturation_arithmetic_vec16', `
|
define(`saturation_arithmetic_vec16', `
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) {
|
define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) {
|
||||||
%res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
|
%res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
|
||||||
ret <16 x i8> %res
|
ret <16 x i8> %res
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) {
|
define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) {
|
||||||
binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
|
binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
|
||||||
ret <16 x i16> %ret
|
ret <16 x i16> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) {
|
define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) {
|
||||||
%res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
|
%res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
|
||||||
ret <16 x i8> %res
|
ret <16 x i8> %res
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) {
|
define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) {
|
||||||
binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
|
binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
|
||||||
ret <16 x i16> %ret
|
ret <16 x i16> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) {
|
define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) {
|
||||||
%res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
|
%res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
|
||||||
ret <16 x i8> %res
|
ret <16 x i8> %res
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) {
|
define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) {
|
||||||
binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
|
binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
|
||||||
ret <16 x i16> %ret
|
ret <16 x i16> %ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||||
define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) {
|
define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) {
|
||||||
%res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
|
%res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
|
||||||
ret <16 x i8> %res
|
ret <16 x i8> %res
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||||
define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) {
|
define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) {
|
||||||
binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
|
binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
|
||||||
ret <16 x i16> %ret
|
ret <16 x i16> %ret
|
||||||
|
|||||||
72
stdlib.ispc
72
stdlib.ispc
@@ -57,6 +57,34 @@
|
|||||||
#error Unknown value of ISPC_MASK_BITS
|
#error Unknown value of ISPC_MASK_BITS
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Limits of integral types. */
|
||||||
|
#ifndef INT8_MIN
|
||||||
|
#define INT8_MIN (-128)
|
||||||
|
#endif
|
||||||
|
#ifndef INT16_MIN
|
||||||
|
#define INT16_MIN (-32768)
|
||||||
|
#endif
|
||||||
|
#ifndef INT32_MIN
|
||||||
|
#define INT32_MIN (-2147483648)
|
||||||
|
#endif
|
||||||
|
#ifndef INT8_MAX
|
||||||
|
#define INT8_MAX (127)
|
||||||
|
#endif
|
||||||
|
#ifndef INT16_MAX
|
||||||
|
#define INT16_MAX (32767)
|
||||||
|
#endif
|
||||||
|
#ifndef INT32_MAX
|
||||||
|
#define INT32_MAX (2147483647)
|
||||||
|
#endif
|
||||||
|
#ifndef UINT8_MAX
|
||||||
|
#define UINT8_MAX (255)
|
||||||
|
#endif
|
||||||
|
#ifndef UINT16_MAX
|
||||||
|
#define UINT16_MAX (65535)
|
||||||
|
#endif
|
||||||
|
#ifndef UINT32_MAX
|
||||||
|
#define UINT32_MAX (4294967295)
|
||||||
|
#endif
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Low level primitives
|
// Low level primitives
|
||||||
|
|
||||||
@@ -4261,7 +4289,12 @@ static inline void fastmath() {
|
|||||||
// saturation arithmetic
|
// saturation arithmetic
|
||||||
|
|
||||||
static inline uniform int8 saturating_add(uniform int8 a, uniform int8 b) {
|
static inline uniform int8 saturating_add(uniform int8 a, uniform int8 b) {
|
||||||
return __padds_i8(a, b);
|
uniform unsigned int8 a_unsig = a, b_unsig = b;
|
||||||
|
uniform unsigned int8 result = a_unsig + b_unsig;
|
||||||
|
a_unsig = (a_unsig >> 7) + INT8_MAX;
|
||||||
|
if ((uniform int8) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
|
||||||
|
result = a_unsig;
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline varying int8 saturating_add(varying int8 a, varying int8 b) {
|
static inline varying int8 saturating_add(varying int8 a, varying int8 b) {
|
||||||
@@ -4269,7 +4302,12 @@ static inline varying int8 saturating_add(varying int8 a, varying int8 b) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static inline uniform int16 saturating_add(uniform int16 a, uniform int16 b) {
|
static inline uniform int16 saturating_add(uniform int16 a, uniform int16 b) {
|
||||||
return __padds_i16(a, b);
|
uniform unsigned int16 a_unsig = a, b_unsig = b;
|
||||||
|
uniform unsigned int16 result = a_unsig + b_unsig;
|
||||||
|
a_unsig = (a_unsig >> 15) + INT16_MAX;
|
||||||
|
if ((uniform int16) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0)
|
||||||
|
result = a_unsig;
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline varying int16 saturating_add(varying int16 a, varying int16 b) {
|
static inline varying int16 saturating_add(varying int16 a, varying int16 b) {
|
||||||
@@ -4278,7 +4316,9 @@ static inline varying int16 saturating_add(varying int16 a, varying int16 b) {
|
|||||||
|
|
||||||
static inline uniform unsigned int8 saturating_add(uniform unsigned int8 a,
|
static inline uniform unsigned int8 saturating_add(uniform unsigned int8 a,
|
||||||
uniform unsigned int8 b) {
|
uniform unsigned int8 b) {
|
||||||
return __paddus_i8(a, b);
|
uniform unsigned int8 result = a + b;
|
||||||
|
result |= (-(uniform int8)(result < a));
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline varying unsigned int8 saturating_add(varying unsigned int8 a,
|
static inline varying unsigned int8 saturating_add(varying unsigned int8 a,
|
||||||
@@ -4288,7 +4328,9 @@ static inline varying unsigned int8 saturating_add(varying unsigned int8 a,
|
|||||||
|
|
||||||
static inline uniform unsigned int16 saturating_add(uniform unsigned int16 a,
|
static inline uniform unsigned int16 saturating_add(uniform unsigned int16 a,
|
||||||
uniform unsigned int16 b) {
|
uniform unsigned int16 b) {
|
||||||
return __paddus_i16(a, b);
|
uniform unsigned int16 result = a + b;
|
||||||
|
result |= (-(uniform int16)(result < a));
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline varying unsigned int16 saturating_add(varying unsigned int16 a,
|
static inline varying unsigned int16 saturating_add(varying unsigned int16 a,
|
||||||
@@ -4297,7 +4339,12 @@ static inline varying unsigned int16 saturating_add(varying unsigned int16 a,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static inline uniform int8 saturating_sub(uniform int8 a, uniform int8 b) {
|
static inline uniform int8 saturating_sub(uniform int8 a, uniform int8 b) {
|
||||||
return __psubs_i8(a, b);
|
uniform unsigned int8 a_unsig = a, b_unsig = b;
|
||||||
|
uniform unsigned int8 result = a_unsig - b_unsig;
|
||||||
|
a_unsig = (a_unsig >> 7) + INT8_MAX;
|
||||||
|
if ((uniform int8) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
|
||||||
|
result = a_unsig;
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline varying int8 saturating_sub(varying int8 a, varying int8 b) {
|
static inline varying int8 saturating_sub(varying int8 a, varying int8 b) {
|
||||||
@@ -4305,7 +4352,12 @@ static inline varying int8 saturating_sub(varying int8 a, varying int8 b) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static inline uniform int16 saturating_sub(uniform int16 a, uniform int16 b) {
|
static inline uniform int16 saturating_sub(uniform int16 a, uniform int16 b) {
|
||||||
return __psubs_i16(a, b);
|
uniform unsigned int16 a_unsig = a, b_unsig = b;
|
||||||
|
uniform unsigned int16 result = a_unsig - b_unsig;
|
||||||
|
a_unsig = (a_unsig >> 15) + INT16_MAX;
|
||||||
|
if ((uniform int16) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0)
|
||||||
|
result = a_unsig;
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline varying int16 saturating_sub(varying int16 a, varying int16 b) {
|
static inline varying int16 saturating_sub(varying int16 a, varying int16 b) {
|
||||||
@@ -4314,7 +4366,9 @@ static inline varying int16 saturating_sub(varying int16 a, varying int16 b) {
|
|||||||
|
|
||||||
static inline uniform unsigned int8 saturating_sub(uniform unsigned int8 a,
|
static inline uniform unsigned int8 saturating_sub(uniform unsigned int8 a,
|
||||||
uniform unsigned int8 b) {
|
uniform unsigned int8 b) {
|
||||||
return __psubus_i8(a, b);
|
uniform unsigned int8 result = a - b;
|
||||||
|
result &= (-(uniform int8)(result <= a));
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline varying unsigned int8 saturating_sub(varying unsigned int8 a,
|
static inline varying unsigned int8 saturating_sub(varying unsigned int8 a,
|
||||||
@@ -4324,7 +4378,9 @@ static inline varying unsigned int8 saturating_sub(varying unsigned int8 a,
|
|||||||
|
|
||||||
static inline uniform unsigned int16 saturating_sub(uniform unsigned int16 a,
|
static inline uniform unsigned int16 saturating_sub(uniform unsigned int16 a,
|
||||||
uniform unsigned int16 b) {
|
uniform unsigned int16 b) {
|
||||||
return __psubus_i16(a, b);
|
uniform unsigned int16 result = a - b;
|
||||||
|
result &= (-(uniform int16)(result <= a));
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline varying unsigned int16 saturating_sub(varying unsigned int16 a,
|
static inline varying unsigned int16 saturating_sub(varying unsigned int16 a,
|
||||||
|
|||||||
Reference in New Issue
Block a user