Some changes for avx1 and avx1.1 in saturation

This commit is contained in:
Vsevolod Livinskij
2013-11-29 03:45:25 +04:00
parent 42c148bf75
commit bec6662338
4 changed files with 298 additions and 0 deletions

View File

@@ -41,6 +41,57 @@ define_prefetches()
define_shuffles()
aossoa()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;scalar saturation arithmetic
;; Scalar i8 signed saturating add, routed through the 16 x i8 SSE2 intrinsic.
declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone

define i8 @__padds_i8(i8 %a0, i8 %a1) {
  ; The helper macro issues the vector call and leaves the scalar result in %sum.
  sse_binary_scalar(sum, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1)
  ret i8 %sum
}
;; Scalar i16 signed saturating add, routed through the 8 x i16 SSE2 intrinsic.
declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone

define i16 @__padds_i16(i16 %a0, i16 %a1) {
  ; The helper macro issues the vector call and leaves the scalar result in %sum.
  sse_binary_scalar(sum, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
  ret i16 %sum
}
;; Scalar i8 unsigned saturating add, routed through the 16 x i8 SSE2 intrinsic.
declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone

define i8 @__paddus_i8(i8 %a0, i8 %a1) {
  ; The helper macro issues the vector call and leaves the scalar result in %sum.
  sse_binary_scalar(sum, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1)
  ret i8 %sum
}
;; Scalar i16 unsigned saturating add, routed through the 8 x i16 SSE2 intrinsic.
declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone

define i16 @__paddus_i16(i16 %a0, i16 %a1) {
  ; The helper macro issues the vector call and leaves the scalar result in %sum.
  sse_binary_scalar(sum, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
  ret i16 %sum
}
;; Scalar i8 signed saturating subtract, routed through the 16 x i8 SSE2 intrinsic.
declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone

define i8 @__psubs_i8(i8 %a0, i8 %a1) {
  ; The helper macro issues the vector call and leaves the scalar result in %diff.
  sse_binary_scalar(diff, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1)
  ret i8 %diff
}
;; Scalar i16 signed saturating subtract, routed through the 8 x i16 SSE2 intrinsic.
declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone

define i16 @__psubs_i16(i16 %a0, i16 %a1) {
  ; The helper macro issues the vector call and leaves the scalar result in %diff.
  sse_binary_scalar(diff, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
  ret i16 %diff
}
;; Scalar i8 unsigned saturating subtract, routed through the 16 x i8 SSE2 intrinsic.
declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone

define i8 @__psubus_i8(i8 %a0, i8 %a1) {
  ; The helper macro issues the vector call and leaves the scalar result in %diff.
  sse_binary_scalar(diff, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1)
  ret i8 %diff
}
;; Scalar i16 unsigned saturating subtract, routed through the 8 x i16 SSE2 intrinsic.
declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone

define i16 @__psubus_i16(i16 %a0, i16 %a1) {
  ; The helper macro issues the vector call and leaves the scalar result in %diff.
  sse_binary_scalar(diff, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
  ret i16 %diff
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

View File

@@ -43,6 +43,49 @@ int64minmax()
include(`target-avx-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;vector saturation arithmetic
define <WIDTH x i8> @__padds_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
  ; Signed saturating add on i8 lanes. The vector type here has to match the
  ; 16 x i8 operands of the intrinsic, so one call handles every lane.
  %sum = call <WIDTH x i8> @llvm.x86.sse2.padds.b(<WIDTH x i8> %a0, <WIDTH x i8> %a1)
  ret <WIDTH x i8> %sum
}
define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
  ; Signed saturating add on i16 lanes. The helper macro applies the 8 x i16
  ; intrinsic (presumably once per 8-lane half) and leaves the result in %sum.
  binary8to16(sum, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
  ret <WIDTH x i16> %sum
}
define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
  ; Unsigned saturating add on i8 lanes. The vector type here has to match the
  ; 16 x i8 operands of the intrinsic, so one call handles every lane.
  %sum = call <WIDTH x i8> @llvm.x86.sse2.paddus.b(<WIDTH x i8> %a0, <WIDTH x i8> %a1)
  ret <WIDTH x i8> %sum
}
define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
  ; Unsigned saturating add on i16 lanes. The helper macro applies the 8 x i16
  ; intrinsic (presumably once per 8-lane half) and leaves the result in %sum.
  binary8to16(sum, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
  ret <WIDTH x i16> %sum
}
define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
  ; Signed saturating subtract on i8 lanes. The vector type here has to match
  ; the 16 x i8 operands of the intrinsic, so one call handles every lane.
  %diff = call <WIDTH x i8> @llvm.x86.sse2.psubs.b(<WIDTH x i8> %a0, <WIDTH x i8> %a1)
  ret <WIDTH x i8> %diff
}
define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
  ; Signed saturating subtract on i16 lanes. The helper macro applies the 8 x i16
  ; intrinsic (presumably once per 8-lane half) and leaves the result in %diff.
  binary8to16(diff, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
  ret <WIDTH x i16> %diff
}
define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
  ; Unsigned saturating subtract on i8 lanes. The vector type here has to match
  ; the 16 x i8 operands of the intrinsic, so one call handles every lane.
  %diff = call <WIDTH x i8> @llvm.x86.sse2.psubus.b(<WIDTH x i8> %a0, <WIDTH x i8> %a1)
  ret <WIDTH x i8> %diff
}
define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
  ; Unsigned saturating subtract on i16 lanes. The helper macro applies the 8 x i16
  ; intrinsic (presumably once per 8-lane half) and leaves the result in %diff.
  binary8to16(diff, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
  ret <WIDTH x i16> %diff
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

View File

@@ -43,6 +43,88 @@ int64minmax()
include(`target-avx-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;vector saturation arithmetic
define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ; Widen both 8-lane operands to the 16 lanes the intrinsic takes.
  ; Lanes 8..15 are undef filler.
  %lhs16 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

  ; Signed saturating add across all 16 lanes.
  %sum16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %lhs16, <16 x i8> %rhs16)

  ; Keep lanes 0..7, the only ones that came from the real operands.
  %sum = shufflevector <16 x i8> %sum16, <16 x i8> undef, <8 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <WIDTH x i8> %sum
}
define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
  ; Signed saturating add on i16 lanes. The vector type here has to match the
  ; 8 x i16 operands of the intrinsic, so one call handles every lane.
  %sum = call <WIDTH x i16> @llvm.x86.sse2.padds.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
  ret <WIDTH x i16> %sum
}
define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ; Widen both 8-lane operands to the 16 lanes the intrinsic takes.
  ; Lanes 8..15 are undef filler.
  %lhs16 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

  ; Unsigned saturating add across all 16 lanes.
  %sum16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %lhs16, <16 x i8> %rhs16)

  ; Keep lanes 0..7, the only ones that came from the real operands.
  %sum = shufflevector <16 x i8> %sum16, <16 x i8> undef, <8 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <WIDTH x i8> %sum
}
define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
  ; Unsigned saturating add on i16 lanes. The vector type here has to match the
  ; 8 x i16 operands of the intrinsic, so one call handles every lane.
  %sum = call <WIDTH x i16> @llvm.x86.sse2.paddus.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
  ret <WIDTH x i16> %sum
}
define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ; Widen both 8-lane operands to the 16 lanes the intrinsic takes.
  ; Lanes 8..15 are undef filler.
  %lhs16 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

  ; Signed saturating subtract across all 16 lanes.
  %diff16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %lhs16, <16 x i8> %rhs16)

  ; Keep lanes 0..7, the only ones that came from the real operands.
  %diff = shufflevector <16 x i8> %diff16, <16 x i8> undef, <8 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <WIDTH x i8> %diff
}
define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
  ; Signed saturating subtract on i16 lanes. The vector type here has to match
  ; the 8 x i16 operands of the intrinsic, so one call handles every lane.
  %diff = call <WIDTH x i16> @llvm.x86.sse2.psubs.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
  ret <WIDTH x i16> %diff
}
define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ; Widen both 8-lane operands to the 16 lanes the intrinsic takes.
  ; Lanes 8..15 are undef filler.
  %lhs16 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

  ; Unsigned saturating subtract across all 16 lanes.
  %diff16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %lhs16, <16 x i8> %rhs16)

  ; Keep lanes 0..7, the only ones that came from the real operands.
  %diff = shufflevector <16 x i8> %diff16, <16 x i8> undef, <8 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <WIDTH x i8> %diff
}
define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
  ; Unsigned saturating subtract on i16 lanes. The vector type here has to match
  ; the 8 x i16 operands of the intrinsic, so one call handles every lane.
  %diff = call <WIDTH x i16> @llvm.x86.sse2.psubus.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
  ret <WIDTH x i16> %diff
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

View File

@@ -43,6 +43,128 @@ int64minmax()
include(`target-avx-common.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;vector saturation arithmetic
define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ; Pad each 4-lane operand out to the 16 lanes the intrinsic takes.
  ; Lanes 4..15 are undef filler.
  %lhs16 = shufflevector <4 x i8> %0, <4 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

  ; Signed saturating add across all 16 lanes.
  %sum16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %lhs16, <16 x i8> %rhs16)

  ; Keep lanes 0..3, the only ones that came from the real operands.
  %sum = shufflevector <16 x i8> %sum16, <16 x i8> undef, <4 x i32>
    <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i8> %sum
}
define <WIDTH x i16> @__padds_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  ; Pad each 4-lane operand out to the 8 lanes the intrinsic takes.
  ; Lanes 4..7 are undef filler.
  %lhs8 = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs8 = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>

  ; Signed saturating add across all 8 lanes.
  %sum8 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %lhs8, <8 x i16> %rhs8)

  ; Keep lanes 0..3, the only ones that came from the real operands.
  %sum = shufflevector <8 x i16> %sum8, <8 x i16> undef, <4 x i32>
    <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i16> %sum
}
define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ; Pad each 4-lane operand out to the 16 lanes the intrinsic takes.
  ; Lanes 4..15 are undef filler.
  %lhs16 = shufflevector <4 x i8> %0, <4 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

  ; Unsigned saturating add across all 16 lanes.
  %sum16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %lhs16, <16 x i8> %rhs16)

  ; Keep lanes 0..3, the only ones that came from the real operands.
  %sum = shufflevector <16 x i8> %sum16, <16 x i8> undef, <4 x i32>
    <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i8> %sum
}
define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  ; Pad each 4-lane operand out to the 8 lanes the intrinsic takes.
  ; Lanes 4..7 are undef filler.
  %lhs8 = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs8 = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>

  ; Unsigned saturating add across all 8 lanes.
  %sum8 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %lhs8, <8 x i16> %rhs8)

  ; Keep lanes 0..3, the only ones that came from the real operands.
  %sum = shufflevector <8 x i16> %sum8, <8 x i16> undef, <4 x i32>
    <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i16> %sum
}
define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ; Pad each 4-lane operand out to the 16 lanes the intrinsic takes.
  ; Lanes 4..15 are undef filler.
  %lhs16 = shufflevector <4 x i8> %0, <4 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

  ; Signed saturating subtract across all 16 lanes.
  %diff16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %lhs16, <16 x i8> %rhs16)

  ; Keep lanes 0..3, the only ones that came from the real operands.
  %diff = shufflevector <16 x i8> %diff16, <16 x i8> undef, <4 x i32>
    <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i8> %diff
}
define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  ; Pad each 4-lane operand out to the 8 lanes the intrinsic takes.
  ; Lanes 4..7 are undef filler.
  %lhs8 = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs8 = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>

  ; Signed saturating subtract across all 8 lanes.
  %diff8 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %lhs8, <8 x i16> %rhs8)

  ; Keep lanes 0..3, the only ones that came from the real operands.
  %diff = shufflevector <8 x i16> %diff8, <8 x i16> undef, <4 x i32>
    <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i16> %diff
}
define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  ; Pad each 4-lane operand out to the 16 lanes the intrinsic takes.
  ; Lanes 4..15 are undef filler.
  %lhs16 = shufflevector <4 x i8> %0, <4 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs16 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32>
    <i32 0, i32 1, i32 2, i32 3,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
     i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>

  ; Unsigned saturating subtract across all 16 lanes.
  %diff16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %lhs16, <16 x i8> %rhs16)

  ; Keep lanes 0..3, the only ones that came from the real operands.
  %diff = shufflevector <16 x i8> %diff16, <16 x i8> undef, <4 x i32>
    <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i8> %diff
}
define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  ; Pad each 4-lane operand out to the 8 lanes the intrinsic takes.
  ; Lanes 4..7 are undef filler.
  %lhs8 = shufflevector <4 x i16> %0, <4 x i16> undef, <8 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %rhs8 = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32>
    <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>

  ; Unsigned saturating subtract across all 8 lanes.
  %diff8 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %lhs8, <8 x i16> %rhs8)

  ; Keep lanes 0..3, the only ones that came from the real operands.
  %diff = shufflevector <8 x i16> %diff8, <8 x i16> undef, <4 x i32>
    <i32 0, i32 1, i32 2, i32 3>
  ret <WIDTH x i16> %diff
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp