Add code generation of saturation
This commit is contained in:
@@ -45,18 +45,12 @@ include(`target-avx-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;vector saturation arithmetic
|
||||
|
||||
;; Signed saturating add on 8-lane i8 vectors, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
@@ -66,17 +60,10 @@ define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
}
|
||||
|
||||
;; Unsigned saturating add on 8-lane i8 vectors, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
@@ -86,17 +73,10 @@ define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
}
|
||||
|
||||
;; Signed saturating subtract on 8-lane i8 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
@@ -106,17 +86,10 @@ define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
}
|
||||
|
||||
;; Unsigned saturating subtract on 8-lane i8 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
|
||||
@@ -45,123 +45,68 @@ include(`target-avx-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;vector saturation arithmetic
|
||||
|
||||
;; Signed saturating add on 4-lane i8 vectors, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert4to16(i8, %0, %v0)
  convert4to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to4(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
;; Signed saturating add on 4-lane i16 vectors, lane by lane.
;; The operands are widened to 8 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i16> @__padds_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  convert4to8(i16, %0, %v0)
  convert4to8(i16, %1, %v1)
  %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1)
  convert8to4(i16, %r16, %r)
  ret <WIDTH x i16> %r
}
|
||||
|
||||
;; Unsigned saturating add on 4-lane i8 vectors, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert4to16(i8, %0, %v0)
  convert4to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to4(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
;; Unsigned saturating add on 4-lane i16 vectors, lane by lane.
;; The operands are widened to 8 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  convert4to8(i16, %0, %v0)
  convert4to8(i16, %1, %v1)
  %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1)
  convert8to4(i16, %r16, %r)
  ret <WIDTH x i16> %r
}
|
||||
|
||||
;; Signed saturating subtract on 4-lane i8 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert4to16(i8, %0, %v0)
  convert4to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to4(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
;; Signed saturating subtract on 4-lane i16 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 8 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  convert4to8(i16, %0, %v0)
  convert4to8(i16, %1, %v1)
  %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1)
  convert8to4(i16, %r16, %r)
  ret <WIDTH x i16> %r
}
|
||||
|
||||
;; Unsigned saturating subtract on 4-lane i8 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert4to16(i8, %0, %v0)
  convert4to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to4(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
;; Unsigned saturating subtract on 4-lane i16 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 8 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  convert4to8(i16, %0, %v0)
  convert4to8(i16, %1, %v1)
  %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1)
  convert8to4(i16, %r16, %r)
  ret <WIDTH x i16> %r
}
|
||||
|
||||
|
||||
@@ -49,18 +49,12 @@ include(`target-sse2-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;vector saturation arithmetic
|
||||
|
||||
;; Signed saturating add on 8-lane i8 vectors, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
@@ -70,17 +64,10 @@ define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
}
|
||||
|
||||
;; Unsigned saturating add on 8-lane i8 vectors, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
@@ -90,17 +77,10 @@ define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
}
|
||||
|
||||
;; Signed saturating subtract on 8-lane i8 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
@@ -110,17 +90,10 @@ define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
}
|
||||
|
||||
;; Unsigned saturating subtract on 8-lane i8 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
|
||||
@@ -46,123 +46,68 @@ include(`target-sse2-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;vector saturation arithmetic
|
||||
|
||||
;; Signed saturating add on 4-lane i8 vectors, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert4to16(i8, %0, %v0)
  convert4to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to4(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
;; Signed saturating add on 4-lane i16 vectors, lane by lane.
;; The operands are widened to 8 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i16> @__padds_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  convert4to8(i16, %0, %v0)
  convert4to8(i16, %1, %v1)
  %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1)
  convert8to4(i16, %r16, %r)
  ret <WIDTH x i16> %r
}
|
||||
|
||||
;; Unsigned saturating add on 4-lane i8 vectors, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert4to16(i8, %0, %v0)
  convert4to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to4(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
;; Unsigned saturating add on 4-lane i16 vectors, lane by lane.
;; The operands are widened to 8 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  convert4to8(i16, %0, %v0)
  convert4to8(i16, %1, %v1)
  %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1)
  convert8to4(i16, %r16, %r)
  ret <WIDTH x i16> %r
}
|
||||
|
||||
;; Signed saturating subtract on 4-lane i8 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert4to16(i8, %0, %v0)
  convert4to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to4(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
;; Signed saturating subtract on 4-lane i16 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 8 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  convert4to8(i16, %0, %v0)
  convert4to8(i16, %1, %v1)
  %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1)
  convert8to4(i16, %r16, %r)
  ret <WIDTH x i16> %r
}
|
||||
|
||||
;; Unsigned saturating subtract on 4-lane i8 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert4to16(i8, %0, %v0)
  convert4to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to4(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
;; Unsigned saturating subtract on 4-lane i16 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 8 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 4 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
  convert4to8(i16, %0, %v0)
  convert4to8(i16, %1, %v1)
  %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1)
  convert8to4(i16, %r16, %r)
  ret <WIDTH x i16> %r
}
|
||||
|
||||
|
||||
@@ -46,18 +46,12 @@ include(`target-sse4-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;vector saturation arithmetic
|
||||
|
||||
;; Signed saturating add on 8-lane i8 vectors, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
@@ -67,17 +61,10 @@ define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
}
|
||||
|
||||
;; Unsigned saturating add on 8-lane i8 vectors, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
@@ -87,17 +74,10 @@ define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
}
|
||||
|
||||
;; Signed saturating subtract on 8-lane i8 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
@@ -107,17 +87,10 @@ define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
}
|
||||
|
||||
;; Unsigned saturating subtract on 8-lane i8 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
|
||||
@@ -49,18 +49,12 @@ include(`target-sse4-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;vector saturation arithmetic
|
||||
|
||||
;; Signed saturating add on 8-lane i8 vectors, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
@@ -70,17 +64,10 @@ define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
}
|
||||
|
||||
;; Unsigned saturating add on 8-lane i8 vectors, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
@@ -90,17 +77,10 @@ define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
}
|
||||
|
||||
;; Signed saturating subtract on 8-lane i8 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
@@ -110,17 +90,10 @@ define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
|
||||
}
|
||||
|
||||
;; Unsigned saturating subtract on 8-lane i8 vectors: first operand minus
;; second operand, lane by lane.
;; The operands are widened to 16 lanes so that the 128-bit SSE2 intrinsic
;; can be used, and the result is narrowed back to 8 lanes.
;; NOTE for review: the widening and narrowing macros are declared outside
;; this view. Each is assumed to bind the name in its third argument to a
;; resized copy of its second argument -- confirm. %v0, %v1 and %r must be
;; bound exactly once, so no explicit shufflevector may repeat them.
define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
  convert8to16(i8, %0, %v0)
  convert8to16(i8, %1, %v1)
  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
  convert16to8(i8, %r16, %r)
  ret <WIDTH x i8> %r
}
|
||||
|
||||
|
||||
@@ -46,123 +46,68 @@ include(`target-sse4-common.ll')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;vector saturation arithmetic
|
||||
|
||||
define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
|
||||
%v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
convert4to16(i8, %0, %v0)
|
||||
convert4to16(i8, %1, %v1)
|
||||
%r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
|
||||
%r = shufflevector <16 x i8> %r16, <16 x i8> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
convert16to4(i8, %r16, %r)
|
||||
ret <WIDTH x i8> %r
|
||||
}
|
||||
|
||||
define <WIDTH x i16> @__padds_vi16(<WIDTH x i16>, <WIDTH x i16>) {
|
||||
%v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
convert4to8(i16, %0, %v0)
|
||||
convert4to8(i16, %1, %v1)
|
||||
%r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1)
|
||||
%r = shufflevector <8 x i16> %r16, <8 x i16> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
convert8to4(i16, %r16, %r)
|
||||
ret <WIDTH x i16> %r
|
||||
}
|
||||
|
||||
define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
|
||||
%v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
convert4to16(i8, %0, %v0)
|
||||
convert4to16(i8, %1, %v1)
|
||||
%r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
|
||||
%r = shufflevector <16 x i8> %r16, <16 x i8> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
convert16to4(i8, %r16, %r)
|
||||
ret <WIDTH x i8> %r
|
||||
}
|
||||
|
||||
define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
|
||||
%v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
convert4to8(i16, %0, %v0)
|
||||
convert4to8(i16, %1, %v1)
|
||||
%r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1)
|
||||
%r = shufflevector <8 x i16> %r16, <8 x i16> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
convert8to4(i16, %r16, %r)
|
||||
ret <WIDTH x i16> %r
|
||||
}
|
||||
|
||||
define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
|
||||
%v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
convert4to16(i8, %0, %v0)
|
||||
convert4to16(i8, %1, %v1)
|
||||
%r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
|
||||
%r = shufflevector <16 x i8> %r16, <16 x i8> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
convert16to4(i8, %r16, %r)
|
||||
ret <WIDTH x i8> %r
|
||||
}
|
||||
|
||||
define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16>, <WIDTH x i16>) {
|
||||
%v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
convert4to8(i16, %0, %v0)
|
||||
convert4to8(i16, %1, %v1)
|
||||
%r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1)
|
||||
%r = shufflevector <8 x i16> %r16, <8 x i16> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
convert8to4(i16, %r16, %r)
|
||||
ret <WIDTH x i16> %r
|
||||
}
|
||||
|
||||
define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
|
||||
%v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
convert4to16(i8, %0, %v0)
|
||||
convert4to16(i8, %1, %v1)
|
||||
%r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
|
||||
%r = shufflevector <16 x i8> %r16, <16 x i8> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
convert16to4(i8, %r16, %r)
|
||||
ret <WIDTH x i8> %r
|
||||
}
|
||||
|
||||
define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
|
||||
%v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
convert4to8(i16, %0, %v0)
|
||||
convert4to8(i16, %1, %v1)
|
||||
%r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1)
|
||||
%r = shufflevector <8 x i16> %r16, <8 x i16> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
convert8to4(i16, %r16, %r)
|
||||
ret <WIDTH x i16> %r
|
||||
}
|
||||
|
||||
|
||||
@@ -49,6 +49,58 @@ define(`MASK_HIGH_BIT_ON',
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; vector conversion utilities
|
||||
;; convert 4-wide vector into 8-wide vector
|
||||
;;
|
||||
;; $1: vector element type
|
||||
;; $2: 4-wide vector
|
||||
;; $3: 8-wide vector
|
||||
|
||||
define(`convert4to8', `
|
||||
$3 = shufflevector <4 x $1> $2, <4 x $1> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
')
|
||||
|
||||
define(`convert4to16', `
|
||||
$3 = shufflevector <4 x $1> $2, <4 x $1> undef,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
')
|
||||
|
||||
define(`convert8to16', `
|
||||
$3 = shufflevector <8 x $1> $2, <8 x $1> undef,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
')
|
||||
|
||||
;; convert 8-wide vector into 4-wide vector
|
||||
;;
|
||||
;; $1: vector element type
|
||||
;; $2: 8-wide vector
|
||||
;; $3: 4-wide vector
|
||||
|
||||
define(`convert8to4', `
|
||||
$3 = shufflevector <8 x $1> $2, <8 x $1> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
')
|
||||
|
||||
|
||||
define(`convert16to4', `
|
||||
$3 = shufflevector <16 x $1> $2, <16 x $1> undef,
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
')
|
||||
|
||||
define(`convert16to8', `
|
||||
$3 = shufflevector <16 x $1> $2, <16 x $1> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; vector deconstruction utilities
|
||||
;; split 8-wide vector into 2 4-wide vectors
|
||||
;;
|
||||
|
||||
Reference in New Issue
Block a user