diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll
index dcca74f0..d5eac54f 100644
--- a/builtins/target-avx-common.ll
+++ b/builtins/target-avx-common.ll
@@ -41,6 +41,64 @@ define_prefetches()
 define_shuffles()
 aossoa()
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; scalar saturation arithmetic
+;;
+;; Each function applies one SSE2 packed saturating intrinsic to a pair of
+;; scalars by way of a helper macro that is not part of this file.
+;; Review note: the helper presumably places the scalars in a vector and
+;; returns one element of the result. Confirm against the macro itself.
+;; padds and paddus are the signed and unsigned saturating add. psubs and
+;; psubus are the signed and unsigned saturating subtract.
+
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__padds_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__padds_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
+  ret i16 %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__paddus_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__paddus_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
+  ret i16 %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__psubs_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__psubs_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
+  ret i16 %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__psubus_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__psubus_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
+  ret i16 %ret
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats
 
diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll
index f8fd5cd5..694afe35 100644
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -43,6 +43,54 @@ int64minmax()
 
 include(`target-avx-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; vector saturation arithmetic
+;;
+;; The i8 variants hand their operands directly to the 16 lane SSE2
+;; intrinsics. The i16 variants go through a helper macro that is not part
+;; of this file. Review note: the helper presumably splits each operand
+;; into two 8 lane halves and joins the results. Confirm in the macro.
+
+define <WIDTH x i8> @__padds_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
+  %res = call <WIDTH x i8> @llvm.x86.sse2.padds.b(<WIDTH x i8> %a0, <WIDTH x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <WIDTH x i8> %res
+}
+
+define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
+  ret <WIDTH x i16> %ret
+}
+
+define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
+  %res = call <WIDTH x i8> @llvm.x86.sse2.paddus.b(<WIDTH x i8> %a0, <WIDTH x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <WIDTH x i8> %res
+}
+
+define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
+  ret <WIDTH x i16> %ret
+}
+
+define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
+  %res = call <WIDTH x i8> @llvm.x86.sse2.psubs.b(<WIDTH x i8> %a0, <WIDTH x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <WIDTH x i8> %res
+}
+
+define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
+  ret <WIDTH x i16> %ret
+}
+
+define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8> %a0, <WIDTH x i8> %a1) {
+  %res = call <WIDTH x i8> @llvm.x86.sse2.psubus.b(<WIDTH x i8> %a0, <WIDTH x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <WIDTH x i8> %res
+}
+
+define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
+  ret <WIDTH x i16> %ret
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
 
diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll
index e98a3843..a5a497d0 100644
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -43,6 +43,93 @@ int64minmax()
 
 include(`target-avx-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; vector saturation arithmetic
+;;
+;; The i8 variants widen each 8 lane operand to 16 lanes with undefined
+;; upper lanes so that the 16 lane SSE2 intrinsic can be used and then
+;; return the low 8 lanes of its result. The i16 variants hand their
+;; operands directly to the 8 lane intrinsics.
+define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__padds_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.padds.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
+define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.paddus.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
+define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.psubs.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
+define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16> %a0, <WIDTH x i16> %a1) {
+  %res = call <WIDTH x i16> @llvm.x86.sse2.psubus.w(<WIDTH x i16> %a0, <WIDTH x i16> %a1)
+  ret <WIDTH x i16> %res
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
 
diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll
index e1832030..831ae0e5 100644
--- a/builtins/target-avx1-i64x4base.ll
+++ b/builtins/target-avx1-i64x4base.ll
@@ -43,6 +43,132 @@ int64minmax()
 
 include(`target-avx-common.ll')
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; vector saturation arithmetic
+;;
+;; Every variant widens its 4 lane operands with undefined upper lanes to
+;; the lane count of the SSE2 intrinsic, 16 lanes for i8 and 8 lanes for
+;; i16, and returns only the low 4 lanes of the intrinsic result.
+define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__padds_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                   i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                   i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1)
+  %r = shufflevector <8 x i16> %r16, <8 x i16> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i16> %r
+}
+
+define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                   i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                   i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1)
+  %r = shufflevector <8 x i16> %r16, <8 x i16> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i16> %r
+}
+
+define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                   i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                   i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1)
+  %r = shufflevector <8 x i16> %r16, <8 x i16> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i16> %r
+}
+
+define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0 = shufflevector <4 x i8> %0, <4 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i8> %1, <4 x i8> undef,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef,
+                    i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
+  %r = shufflevector <16 x i8> %r16, <16 x i8> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i8> %r
+}
+
+define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0 = shufflevector <4 x i16> %0, <4 x i16> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                   i32 undef, i32 undef, i32 undef, i32 undef>
+  %v1 = shufflevector <4 x i16> %1, <4 x i16> undef,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                   i32 undef, i32 undef, i32 undef, i32 undef>
+  %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1)
+  %r = shufflevector <8 x i16> %r16, <8 x i16> undef,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <WIDTH x i16> %r
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
 