Some changes for avx1 and avx1.1 in saturation
This commit is contained in:
@@ -41,6 +41,57 @@ define_prefetches()
|
||||
define_shuffles()
|
||||
aossoa()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;scalar saturation arithmetic
|
||||
|
||||
;; Scalar signed saturating i8 add.
;; Declares the 16 x i8 padds.b intrinsic, then passes it together with
;; %a0 and %a1 to the scalar helper macro on the line after the function
;; header; the value the macro binds to %ret is returned.
;; TODO confirm: the helper macro body is not visible here - presumably it
;; runs the intrinsic on lane 0 of a vector and extracts that lane.
declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
define i8 @__padds_i8(i8 %a0, i8 %a1) {
|
||||
sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1)
|
||||
ret i8 %ret
|
||||
}
|
||||
|
||||
;; Scalar signed saturating i16 add.
;; Declares the 8 x i16 padds.w intrinsic, then passes it together with
;; %a0 and %a1 to the scalar helper macro on the line after the function
;; header; the value the macro binds to %ret is returned.
;; TODO confirm: the helper macro body is not visible here - presumably it
;; runs the intrinsic on lane 0 of a vector and extracts that lane.
declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
define i16 @__padds_i16(i16 %a0, i16 %a1) {
|
||||
sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
;; Scalar unsigned saturating i8 add.
;; Declares the 16 x i8 paddus.b intrinsic, then passes it together with
;; %a0 and %a1 to the scalar helper macro on the line after the function
;; header; the value the macro binds to %ret is returned.
;; TODO confirm: the helper macro body is not visible here - presumably it
;; runs the intrinsic on lane 0 of a vector and extracts that lane.
declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
define i8 @__paddus_i8(i8 %a0, i8 %a1) {
|
||||
sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1)
|
||||
ret i8 %ret
|
||||
}
|
||||
|
||||
;; Scalar unsigned saturating i16 add.
;; Declares the 8 x i16 paddus.w intrinsic, then passes it together with
;; %a0 and %a1 to the scalar helper macro on the line after the function
;; header; the value the macro binds to %ret is returned.
;; TODO confirm: the helper macro body is not visible here - presumably it
;; runs the intrinsic on lane 0 of a vector and extracts that lane.
declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
define i16 @__paddus_i16(i16 %a0, i16 %a1) {
|
||||
sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
;; Scalar signed saturating i8 subtract, computing %a0 minus %a1.
;; Declares the 16 x i8 psubs.b intrinsic, then passes it together with
;; %a0 and %a1 to the scalar helper macro on the line after the function
;; header; the value the macro binds to %ret is returned.
;; TODO confirm: the helper macro body is not visible here - presumably it
;; runs the intrinsic on lane 0 of a vector and extracts that lane.
declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
define i8 @__psubs_i8(i8 %a0, i8 %a1) {
|
||||
sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1)
|
||||
ret i8 %ret
|
||||
}
|
||||
|
||||
;; Scalar signed saturating i16 subtract, computing %a0 minus %a1.
;; Declares the 8 x i16 psubs.w intrinsic, then passes it together with
;; %a0 and %a1 to the scalar helper macro on the line after the function
;; header; the value the macro binds to %ret is returned.
;; TODO confirm: the helper macro body is not visible here - presumably it
;; runs the intrinsic on lane 0 of a vector and extracts that lane.
declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
define i16 @__psubs_i16(i16 %a0, i16 %a1) {
|
||||
sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
;; Scalar unsigned saturating i8 subtract, computing %a0 minus %a1.
;; Declares the 16 x i8 psubus.b intrinsic, then passes it together with
;; %a0 and %a1 to the scalar helper macro on the line after the function
;; header; the value the macro binds to %ret is returned.
;; TODO confirm: the helper macro body is not visible here - presumably it
;; runs the intrinsic on lane 0 of a vector and extracts that lane.
declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
define i8 @__psubus_i8(i8 %a0, i8 %a1) {
|
||||
sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1)
|
||||
ret i8 %ret
|
||||
}
|
||||
|
||||
;; Scalar unsigned saturating i16 subtract, computing %a0 minus %a1.
;; Declares the 8 x i16 psubus.w intrinsic, then passes it together with
;; %a0 and %a1 to the scalar helper macro on the line after the function
;; header; the value the macro binds to %ret is returned.
;; TODO confirm: the helper macro body is not visible here - presumably it
;; runs the intrinsic on lane 0 of a vector and extracts that lane.
declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
define i16 @__psubus_i16(i16 %a0, i16 %a1) {
|
||||
sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
|
||||
ret i16 %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
|
||||
@@ -43,6 +43,49 @@ int64minmax()
|
||||
|
||||
include(`target-avx-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;vector saturation arithmetic
|
||||
|
||||
;; Signed saturating i8 add over the whole gang: one call to the padds.b
;; intrinsic on the two operands, result returned unchanged.
;; The intrinsic is typed on 16 x i8, so this relies on the gang width of
;; this target being 16.
define <WIDTH x i8> @__padds_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
  %sat_sum = call <WIDTH x i8> @llvm.x86.sse2.padds.b(<WIDTH x i8> %a0,
                                                      <WIDTH x i8> %a1)
  ret <WIDTH x i8> %sat_sum
}
|
||||
|
||||
;; Vector signed saturating i16 add.
;; The helper macro on the first body line receives the 8 x i16 padds.w
;; intrinsic plus both operands and binds %ret, which is then returned.
;; TODO confirm: the helper macro is defined elsewhere - presumably it
;; splits each operand into 8-lane halves, calls the intrinsic on each
;; half, and joins the two results.
define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
|
||||
ret <WIDTH x i16> %ret
|
||||
}
|
||||
|
||||
;; Unsigned saturating i8 add over the whole gang: one call to the paddus.b
;; intrinsic on the two operands, result returned unchanged.
;; The intrinsic is typed on 16 x i8, so this relies on the gang width of
;; this target being 16.
define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
  %sat_sum = call <WIDTH x i8> @llvm.x86.sse2.paddus.b(<WIDTH x i8> %a0,
                                                       <WIDTH x i8> %a1)
  ret <WIDTH x i8> %sat_sum
}
|
||||
|
||||
;; Vector unsigned saturating i16 add.
;; The helper macro on the first body line receives the 8 x i16 paddus.w
;; intrinsic plus both operands and binds %ret, which is then returned.
;; TODO confirm: the helper macro is defined elsewhere - presumably it
;; splits each operand into 8-lane halves, calls the intrinsic on each
;; half, and joins the two results.
define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
|
||||
ret <WIDTH x i16> %ret
|
||||
}
|
||||
|
||||
;; Signed saturating i8 subtract over the whole gang, %a0 minus %a1: one
;; call to the psubs.b intrinsic, result returned unchanged.
;; The intrinsic is typed on 16 x i8, so this relies on the gang width of
;; this target being 16.
define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
  %sat_diff = call <WIDTH x i8> @llvm.x86.sse2.psubs.b(<WIDTH x i8> %a0,
                                                       <WIDTH x i8> %a1)
  ret <WIDTH x i8> %sat_diff
}
|
||||
|
||||
;; Vector signed saturating i16 subtract, %a0 minus %a1.
;; The helper macro on the first body line receives the 8 x i16 psubs.w
;; intrinsic plus both operands and binds %ret, which is then returned.
;; TODO confirm: the helper macro is defined elsewhere - presumably it
;; splits each operand into 8-lane halves, calls the intrinsic on each
;; half, and joins the two results.
define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
|
||||
ret <WIDTH x i16> %ret
|
||||
}
|
||||
|
||||
;; Unsigned saturating i8 subtract over the whole gang, %a0 minus %a1: one
;; call to the psubus.b intrinsic, result returned unchanged.
;; The intrinsic is typed on 16 x i8, so this relies on the gang width of
;; this target being 16.
define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
  %sat_diff = call <WIDTH x i8> @llvm.x86.sse2.psubus.b(<WIDTH x i8> %a0,
                                                        <WIDTH x i8> %a1)
  ret <WIDTH x i8> %sat_diff
}
|
||||
|
||||
;; Vector unsigned saturating i16 subtract, %a0 minus %a1.
;; The helper macro on the first body line receives the 8 x i16 psubus.w
;; intrinsic plus both operands and binds %ret, which is then returned.
;; TODO confirm: the helper macro is defined elsewhere - presumably it
;; splits each operand into 8-lane halves, calls the intrinsic on each
;; half, and joins the two results.
define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
|
||||
ret <WIDTH x i16> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
|
||||
@@ -43,6 +43,88 @@ int64minmax()
|
||||
|
||||
include(`target-avx-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;vector saturation arithmetic
|
||||
;; Signed saturating add of two 8 x i8 vectors.
;; The padds.b intrinsic takes 16 x i8 operands, so both inputs are widened
;; with undef upper lanes, the intrinsic runs once, and the low 8 lanes of
;; its result are returned.
define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ;; widen 8 -> 16 lanes; lanes 8-15 are left undef
  %lhs16 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %sum16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %lhs16, <16 x i8> %rhs16)
  ;; narrow back to the 8 lanes that carry real data
  %sum8 = shufflevector <16 x i8> %sum16, <16 x i8> undef, <8 x i32>
      <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <WIDTH x i8> %sum8
}
|
||||
|
||||
;; Signed saturating i16 add over the whole gang: one call to the padds.w
;; intrinsic on the two operands, result returned unchanged.
;; The intrinsic is typed on 8 x i16, so this relies on the gang width of
;; this target being 8.
define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
  %sat_sum = call <WIDTH x i16> @llvm.x86.sse2.padds.w(<WIDTH x i16> %a0,
                                                       <WIDTH x i16> %a1)
  ret <WIDTH x i16> %sat_sum
}
|
||||
|
||||
;; Unsigned saturating add of two 8 x i8 vectors.
;; The paddus.b intrinsic takes 16 x i8 operands, so both inputs are widened
;; with undef upper lanes, the intrinsic runs once, and the low 8 lanes of
;; its result are returned.
define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ;; widen 8 -> 16 lanes; lanes 8-15 are left undef
  %lhs16 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %sum16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %lhs16, <16 x i8> %rhs16)
  ;; narrow back to the 8 lanes that carry real data
  %sum8 = shufflevector <16 x i8> %sum16, <16 x i8> undef, <8 x i32>
      <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <WIDTH x i8> %sum8
}
|
||||
|
||||
;; Unsigned saturating i16 add over the whole gang: one call to the paddus.w
;; intrinsic on the two operands, result returned unchanged.
;; The intrinsic is typed on 8 x i16, so this relies on the gang width of
;; this target being 8.
define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
  %sat_sum = call <WIDTH x i16> @llvm.x86.sse2.paddus.w(<WIDTH x i16> %a0,
                                                        <WIDTH x i16> %a1)
  ret <WIDTH x i16> %sat_sum
}
|
||||
|
||||
;; Signed saturating subtract of two 8 x i8 vectors, first minus second.
;; The psubs.b intrinsic takes 16 x i8 operands, so both inputs are widened
;; with undef upper lanes, the intrinsic runs once, and the low 8 lanes of
;; its result are returned.
define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ;; widen 8 -> 16 lanes; lanes 8-15 are left undef
  %lhs16 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %diff16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %lhs16, <16 x i8> %rhs16)
  ;; narrow back to the 8 lanes that carry real data
  %diff8 = shufflevector <16 x i8> %diff16, <16 x i8> undef, <8 x i32>
      <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <WIDTH x i8> %diff8
}
|
||||
|
||||
;; Signed saturating i16 subtract over the whole gang, %a0 minus %a1: one
;; call to the psubs.w intrinsic, result returned unchanged.
;; The intrinsic is typed on 8 x i16, so this relies on the gang width of
;; this target being 8.
define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
  %sat_diff = call <WIDTH x i16> @llvm.x86.sse2.psubs.w(<WIDTH x i16> %a0,
                                                        <WIDTH x i16> %a1)
  ret <WIDTH x i16> %sat_diff
}
|
||||
|
||||
;; Unsigned saturating subtract of two 8 x i8 vectors, first minus second.
;; The psubus.b intrinsic takes 16 x i8 operands, so both inputs are widened
;; with undef upper lanes, the intrinsic runs once, and the low 8 lanes of
;; its result are returned.
define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ;; widen 8 -> 16 lanes; lanes 8-15 are left undef
  %lhs16 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %diff16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %lhs16, <16 x i8> %rhs16)
  ;; narrow back to the 8 lanes that carry real data
  %diff8 = shufflevector <16 x i8> %diff16, <16 x i8> undef, <8 x i32>
      <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <WIDTH x i8> %diff8
}
|
||||
|
||||
;; Unsigned saturating i16 subtract over the whole gang, %a0 minus %a1: one
;; call to the psubus.w intrinsic, result returned unchanged.
;; The intrinsic is typed on 8 x i16, so this relies on the gang width of
;; this target being 8.
define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
  %sat_diff = call <WIDTH x i16> @llvm.x86.sse2.psubus.w(<WIDTH x i16> %a0,
                                                         <WIDTH x i16> %a1)
  ret <WIDTH x i16> %sat_diff
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
|
||||
@@ -43,6 +43,128 @@ int64minmax()
|
||||
|
||||
include(`target-avx-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;vector saturation arithmetic
|
||||
;; Signed saturating add of two 4 x i8 vectors.
;; The padds.b intrinsic takes 16 x i8 operands, so both inputs are widened
;; with undef upper lanes, the intrinsic runs once, and the low 4 lanes of
;; its result are returned.
define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ;; widen 4 -> 16 lanes; lanes 4-15 are left undef
  %lhs16 = shufflevector <4 x i8> %0, <4 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %sum16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %lhs16, <16 x i8> %rhs16)
  ;; narrow back to the 4 lanes that carry real data
  %sum4 = shufflevector <16 x i8> %sum16, <16 x i8> undef, <4 x i32>
      <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i8> %sum4
}
|
||||
|
||||
;; Signed saturating add of two 4 x i16 vectors.
;; The padds.w intrinsic takes 8 x i16 operands, so both inputs are widened
;; with undef upper lanes, the intrinsic runs once, and the low 4 lanes of
;; its result are returned.
define <WIDTH x i16> @__padds_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  ;; widen 4 -> 8 lanes; lanes 4-7 are left undef
  %lhs8 = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs8 = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %sum8 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %lhs8, <8 x i16> %rhs8)
  ;; narrow back to the 4 lanes that carry real data
  %sum4 = shufflevector <8 x i16> %sum8, <8 x i16> undef, <4 x i32>
      <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i16> %sum4
}
|
||||
|
||||
;; Unsigned saturating add of two 4 x i8 vectors.
;; The paddus.b intrinsic takes 16 x i8 operands, so both inputs are widened
;; with undef upper lanes, the intrinsic runs once, and the low 4 lanes of
;; its result are returned.
define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ;; widen 4 -> 16 lanes; lanes 4-15 are left undef
  %lhs16 = shufflevector <4 x i8> %0, <4 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %sum16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %lhs16, <16 x i8> %rhs16)
  ;; narrow back to the 4 lanes that carry real data
  %sum4 = shufflevector <16 x i8> %sum16, <16 x i8> undef, <4 x i32>
      <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i8> %sum4
}
|
||||
|
||||
;; Unsigned saturating add of two 4 x i16 vectors.
;; The paddus.w intrinsic takes 8 x i16 operands, so both inputs are widened
;; with undef upper lanes, the intrinsic runs once, and the low 4 lanes of
;; its result are returned.
define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  ;; widen 4 -> 8 lanes; lanes 4-7 are left undef
  %lhs8 = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs8 = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %sum8 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %lhs8, <8 x i16> %rhs8)
  ;; narrow back to the 4 lanes that carry real data
  %sum4 = shufflevector <8 x i16> %sum8, <8 x i16> undef, <4 x i32>
      <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i16> %sum4
}
|
||||
|
||||
;; Signed saturating subtract of two 4 x i8 vectors, first minus second.
;; The psubs.b intrinsic takes 16 x i8 operands, so both inputs are widened
;; with undef upper lanes, the intrinsic runs once, and the low 4 lanes of
;; its result are returned.
define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ;; widen 4 -> 16 lanes; lanes 4-15 are left undef
  %lhs16 = shufflevector <4 x i8> %0, <4 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %diff16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %lhs16, <16 x i8> %rhs16)
  ;; narrow back to the 4 lanes that carry real data
  %diff4 = shufflevector <16 x i8> %diff16, <16 x i8> undef, <4 x i32>
      <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i8> %diff4
}
|
||||
|
||||
;; Signed saturating subtract of two 4 x i16 vectors, first minus second.
;; The psubs.w intrinsic takes 8 x i16 operands, so both inputs are widened
;; with undef upper lanes, the intrinsic runs once, and the low 4 lanes of
;; its result are returned.
define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  ;; widen 4 -> 8 lanes; lanes 4-7 are left undef
  %lhs8 = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs8 = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %diff8 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %lhs8, <8 x i16> %rhs8)
  ;; narrow back to the 4 lanes that carry real data
  %diff4 = shufflevector <8 x i16> %diff8, <8 x i16> undef, <4 x i32>
      <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i16> %diff4
}
|
||||
|
||||
;; Unsigned saturating subtract of two 4 x i8 vectors, first minus second.
;; The psubus.b intrinsic takes 16 x i8 operands, so both inputs are widened
;; with undef upper lanes, the intrinsic runs once, and the low 4 lanes of
;; its result are returned.
define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ;; widen 4 -> 16 lanes; lanes 4-15 are left undef
  %lhs16 = shufflevector <4 x i8> %0, <4 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %diff16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %lhs16, <16 x i8> %rhs16)
  ;; narrow back to the 4 lanes that carry real data
  %diff4 = shufflevector <16 x i8> %diff16, <16 x i8> undef, <4 x i32>
      <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i8> %diff4
}
|
||||
|
||||
;; Unsigned saturating subtract of two 4 x i16 vectors, first minus second.
;; The psubus.w intrinsic takes 8 x i16 operands, so both inputs are widened
;; with undef upper lanes, the intrinsic runs once, and the low 4 lanes of
;; its result are returned.
define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  ;; widen 4 -> 8 lanes; lanes 4-7 are left undef
  %lhs8 = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs8 = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
      <i32 0, i32 1, i32 2, i32 3,
       i32 undef, i32 undef, i32 undef, i32 undef>
  %diff8 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %lhs8, <8 x i16> %rhs8)
  ;; narrow back to the 4 lanes that carry real data
  %diff4 = shufflevector <8 x i16> %diff8, <8 x i16> undef, <4 x i32>
      <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i16> %diff4
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rcp
|
||||
|
||||
|
||||
Reference in New Issue
Block a user