diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index a5a497d0..c56ec67d 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -45,18 +45,12 @@ include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -66,17 +60,10 @@ define @__padds_vi16( %a0, %a1) { } define @__paddus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -86,17 +73,10 @@ define @__paddus_vi16( %a0, %a1) { } define @__psubs_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -106,17 +86,10 @@ define @__psubs_vi16( %a0, %a1) { } define @__psubus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x 
i32> + convert16to8(i8, %r16, %r) ret %r } diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index 831ae0e5..de26a29e 100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -45,123 +45,68 @@ include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__padds_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__paddus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__paddus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubs_vi8(, ) { - %v0 = 
shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubs_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 0f3eb275..d59513b3 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -49,18 +49,12 @@ include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> 
undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -70,17 +64,10 @@ define @__padds_vi16( %a0, %a1) { } define @__paddus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -90,17 +77,10 @@ define @__paddus_vi16( %a0, %a1) { } define @__psubs_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -110,17 +90,10 @@ define @__psubs_vi16( %a0, %a1) { } define @__psubus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 1409e31d..11c51f70 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -46,123 +46,68 @@ include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x 
i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__padds_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__paddus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__paddus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubs_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubs_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, 
<4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 0ba62ac9..156cccab 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -46,18 +46,12 @@ include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -67,17 +61,10 @@ define @__padds_vi16( %a0, %a1) { } define @__paddus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, 
%0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -87,17 +74,10 @@ define @__paddus_vi16( %a0, %a1) { } define @__psubs_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -107,17 +87,10 @@ define @__psubs_vi16( %a0, %a1) { } define @__psubus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 5c330e51..1f4f8332 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -49,18 +49,12 @@ include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -70,17 +64,10 @@ define @__padds_vi16( %a0, %a1) { } define @__paddus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector 
<8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -90,17 +77,10 @@ define @__paddus_vi16( %a0, %a1) { } define @__psubs_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -110,17 +90,10 @@ define @__psubs_vi16( %a0, %a1) { } define @__psubus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 0478ab2c..2f6ebf6a 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -46,123 +46,68 @@ include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__padds_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x 
i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__paddus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__paddus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubs_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubs_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + 
convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } diff --git a/builtins/util.m4 b/builtins/util.m4 index e1c9bf97..5f75d23a 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,58 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector conversion utilities +;; convert 4-wide vector into 8-wide vector +;; +;; $1: vector element type +;; $2: 4-wide vector +;; $3: 8-wide vector + +define(`convert4to8', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, + <8 x i32> +') + +define(`convert4to16', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, + <16 x i32> +') + +define(`convert8to16', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <16 x i32> +') + +;; convert 8-wide vector into 4-wide vector +;; +;; $1: vector element type +;; $2: 8-wide vector +;; $3: 4-wide vector + +define(`convert8to4', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + + +define(`convert16to4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <4 x i32> +') + +define(`convert16to8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; vector deconstruction utilities ;; split 8-wide vector into 2 4-wide vectors ;;