From 42c148bf75bf8efc498370c51a90e71010bdb725 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Fri, 29 Nov 2013 03:33:40 +0400 Subject: [PATCH] Changes for sse2 and sse4 in saturation --- builtins/target-avx-common.ll | 97 -------------------------- builtins/target-sse2-common.ll | 48 +------------ builtins/target-sse2-x2.ll | 82 ++++++++++++++++++++++ builtins/target-sse2.ll | 122 +++++++++++++++++++++++++++++++++ builtins/target-sse4-16.ll | 82 ++++++++++++++++++++++ builtins/target-sse4-8.ll | 43 ++++++++++++ builtins/target-sse4-common.ll | 49 +------------ builtins/target-sse4-x2.ll | 82 ++++++++++++++++++++++ builtins/target-sse4.ll | 122 +++++++++++++++++++++++++++++++++ 9 files changed, 535 insertions(+), 192 deletions(-) diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index d47145f2..dcca74f0 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -41,103 +41,6 @@ define_prefetches() define_shuffles() aossoa() -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; saturation arithmetic - -declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.padds.b, %a0, %a1) - ret i8 %ret -} - -define <32 x i8> @__padds_vi8(<32 x i8> %a0, <32 x i8> %a1) { - %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} - - -declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.padds.w, %a0, %a1) - ret i16 %ret -} - -define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) { - %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} - - -declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.paddus.b, %a0, %a1) - ret i8 %ret -} - -define <32 x i8> @__paddus_vi8(<32 x i8> %a0, <32 x i8> %a1) { - %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} - - -declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.paddus.w, %a0, %a1) - ret i16 %ret -} - -define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) { - %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} - - -declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubs.b, %a0, %a1) - ret i8 %ret -} - -define <32 x i8> @__psubs_vi8(<32 x i8> %a0, <32 x i8> %a1) { - %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} - - -declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubs.w, %a0, %a1) - ret i16 %ret -} - -define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) { - %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} - - -declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone -define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubus.b, %a0, %a1) - ret i8 %ret -} - -define <32 x i8> @__psubus_vi8(<32 x i8> %a0, <32 x i8> %a1) { - %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} - - -declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubus.w, %a0, %a1) - ret i16 %ret -} - -define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) { - %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index a1e6f915..a1fec300 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -36,7 +36,7 @@ aossoa() rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; saturation arithmetic +;;scalar saturation arithmetic declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__padds_i8(i8 %a0, i8 %a1) { @@ -44,94 +44,48 @@ define i8 @__padds_i8(i8 %a0, i8 %a1) { ret i8 %ret } -define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__padds_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__paddus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__paddus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubs_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubs_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 77bf1a9d..0f3eb275 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -47,6 +47,88 @@ int64minmax() include(`target-sse2-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index e42d4990..1409e31d 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -44,6 +44,128 @@ int64minmax() include(`target-sse2-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__padds_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 72b81ff0..0ba62ac9 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -44,6 +44,88 @@ int64minmax() include(`target-sse4-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 69b355e3..6f00aa83 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -44,6 +44,49 @@ int64minmax() include(`target-sse4-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic + +define @__padds_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__padds_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret %ret +} + +define @__paddus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__paddus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret %ret +} + +define @__psubs_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubs_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret %ret +} + +define @__psubus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 70acca63..e33dbf01 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -39,7 +39,7 @@ aossoa() rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; saturation arithmetic +;;scalar saturation arithmetic declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__padds_i8(i8 %a0, i8 %a1) { @@ -47,95 +47,48 @@ define i8 @__padds_i8(i8 %a0, i8 %a1) { ret i8 %ret } -define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__padds_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__paddus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__paddus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubs_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubs_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 842db53f..5c330e51 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -47,6 +47,88 @@ int64minmax() include(`target-sse4-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 16177b47..0478ab2c 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -44,6 +44,128 @@ int64minmax() include(`target-sse4-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__padds_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines