From 4faff1a63cbe9970f895d96518583897f2f8eb66 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Sat, 30 Nov 2013 10:48:18 +0400 Subject: [PATCH] structural change --- builtins/target-avx-common.ll | 52 +----- builtins/target-avx-x2.ll | 44 +---- builtins/target-avx.ll | 56 +----- builtins/target-avx1-i64x4base.ll | 68 +------- builtins/target-sse2-common.ll | 52 +----- builtins/target-sse2-x2.ll | 56 +----- builtins/target-sse2.ll | 68 +------- builtins/target-sse4-16.ll | 56 +----- builtins/target-sse4-8.ll | 44 +---- builtins/target-sse4-common.ll | 52 +----- builtins/target-sse4-x2.ll | 56 +----- builtins/target-sse4.ll | 68 +------- builtins/util.m4 | 273 ++++++++++++++++++++++++++++++ 13 files changed, 285 insertions(+), 660 deletions(-) diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index d5eac54f..32157a77 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -40,57 +40,7 @@ ctlztz() define_prefetches() define_shuffles() aossoa() - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;scalar saturation arithmetic - -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> 
@llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret i16 %ret -} +saturation_arithmetic_scalar() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 694afe35..cde63e7b 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -40,52 +40,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec16() include(`target-avx-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__padds_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret %ret -} - -define @__paddus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__paddus_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret %ret -} - -define @__psubs_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__psubs_vi16( %a0, %a1) { - 
binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret %ret -} - -define @__psubus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__psubus_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index c56ec67d..8f20bfed 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -40,64 +40,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec8() include(`target-avx-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res -} - -define @__paddus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res -} - -define @__psubs_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res -} - -define @__psubus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) - ret 
%res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index de26a29e..a2d292f2 100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -40,76 +40,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec4() include(`target-avx-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__padds_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__paddus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubs_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16(, ) { 
- convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index a1fec300..b5c5559c 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -34,57 +34,7 @@ define_prefetches() define_shuffles() aossoa() rdrand_decls() - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;scalar saturation arithmetic - -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone 
-define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret i16 %ret -} +saturation_arithmetic_scalar() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index d59513b3..b4b52d91 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -44,64 +44,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec8() include(`target-sse2-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res -} - -define @__paddus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res -} - -define @__psubs_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res -} - -define @__psubus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - 
-define @__psubus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) - ret %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 11c51f70..bdf6f848 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -41,76 +41,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec4() include(`target-sse2-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__padds_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__paddus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubs_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> 
%v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 156cccab..1c0b045a 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -41,64 +41,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec8() include(`target-sse4-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res -} - -define @__paddus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res -} - -define @__psubs_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res -} - -define @__psubus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) - ret %res -} - 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 6f00aa83..49351856 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -41,52 +41,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec16() include(`target-sse4-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__padds_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret %ret -} - -define @__paddus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__paddus_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret %ret -} - -define @__psubs_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__psubs_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret %ret -} - -define @__psubus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__psubus_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index e33dbf01..8eeaa413 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -37,57 +37,7 @@ define_prefetches() define_shuffles() aossoa() rdrand_decls() - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;scalar saturation arithmetic - -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) 
nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret i16 %ret -} +saturation_arithmetic_scalar() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 1f4f8332..2cd0ea4d 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -44,64 +44,10 @@ stdlib_core() packed_load_and_store() scans() 
int64minmax() +saturation_arithmetic_vec8() include(`target-sse4-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res -} - -define @__paddus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res -} - -define @__psubs_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res -} - -define @__psubus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) - ret %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 2f6ebf6a..96effe39 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -41,76 +41,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec4() include(`target-sse4-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) 
- %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__padds_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__paddus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubs_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/util.m4 b/builtins/util.m4 index 5f75d23a..0d5ed2de 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -77,6 +77,42 @@ define(`convert8to16', ` i32 undef, i32 undef, i32 undef, i32 undef> ') +define(`convert4to32', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, + <32 x i32> +') + +define(`convert8to32', ` + 
$3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <32 x i32> +') + +define(`convert16to32', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <32 x i32> +') + ;; convert 4-wide vector into 8-wide vector ;; ;; $1: vector element type @@ -99,6 +135,243 @@ define(`convert16to8', ` <8 x i32> ') +define(`convert32to4', ` + $3 = shufflevector <32 x $1> $2, <32 x $1> undef, + <4 x i32> +') + +define(`convert32to8', ` + $3 = shufflevector <32 x $1> $2, <32 x $1> undef, + <8 x i32> +') + +define(`convert32to16', ` + $3 = shufflevector <32 x $1> $2, <32 x $1> undef, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;saturation arithmetic +;;scalar saturation arithmetic + +define(`saturation_arithmetic_scalar', ` +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__padds_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__padds_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__paddus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__paddus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubs_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubs_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, 
@llvm.x86.sse2.psubs.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret i16 %ret +} +') + +;;4-wide vector saturation arithmetic + +define(`saturation_arithmetic_vec4', ` +define @__padds_vi8(, ) { + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to4(i8, %r16, %r) + ret %r +} + +define @__padds_vi16(, ) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret %r +} + +define @__paddus_vi8(, ) { + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to4(i8, %r16, %r) + ret %r +} + +define @__paddus_vi16(, ) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret %r +} + +define @__psubs_vi8(, ) { + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to4(i8, %r16, %r) + ret %r +} + +define @__psubs_vi16(, ) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret %r +} + +define @__psubus_vi8(, ) { + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to4(i8, %r16, %r) + ret %r +} + 
+define @__psubus_vi16(, ) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret %r +} +') + +;;8-wide vector saturation arithmetic + +define(`saturation_arithmetic_vec8', ` +define @__padds_vi8(, ) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} +') + +;;16-wide vector saturation arithmetic + +define(`saturation_arithmetic_vec16', ` +define @__padds_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__padds_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret %ret +} + +define @__paddus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__paddus_vi16( %a0, %a1) { + 
binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret %ret +} + +define @__psubs_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubs_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret %ret +} + +define @__psubus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret %ret +} +') + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector deconstruction utilities