diff --git a/builtins/target-generic-16.ll b/builtins/target-generic-16.ll
index df04187c..cc5644bc 100644
--- a/builtins/target-generic-16.ll
+++ b/builtins/target-generic-16.ll
@@ -31,4 +31,4 @@
 
 define(`WIDTH',`16')
 include(`target-generic-common.ll')
-saturation_arithmetic()
+saturation_arithmetic_novec()
diff --git a/builtins/target-generic-32.ll b/builtins/target-generic-32.ll
index 5f89bcdf..8eb31c48 100644
--- a/builtins/target-generic-32.ll
+++ b/builtins/target-generic-32.ll
@@ -31,3 +31,4 @@
 
 define(`WIDTH',`32')
 include(`target-generic-common.ll')
+saturation_arithmetic_novec()
diff --git a/builtins/target-generic-4.ll b/builtins/target-generic-4.ll
index e43f45c5..d80c5b91 100644
--- a/builtins/target-generic-4.ll
+++ b/builtins/target-generic-4.ll
@@ -31,4 +31,4 @@
 
 define(`WIDTH',`4')
 include(`target-generic-common.ll')
-saturation_arithmetic()
+saturation_arithmetic_novec()
diff --git a/builtins/target-generic-64.ll b/builtins/target-generic-64.ll
index 09443f8e..6a044c41 100644
--- a/builtins/target-generic-64.ll
+++ b/builtins/target-generic-64.ll
@@ -31,3 +31,4 @@
 
 define(`WIDTH',`64')
 include(`target-generic-common.ll')
+saturation_arithmetic_novec()
diff --git a/builtins/target-generic-8.ll b/builtins/target-generic-8.ll
index 6b87509d..4353658c 100644
--- a/builtins/target-generic-8.ll
+++ b/builtins/target-generic-8.ll
@@ -31,4 +31,4 @@
 
 define(`WIDTH',`8')
 include(`target-generic-common.ll')
-saturation_arithmetic()
+saturation_arithmetic_novec()
diff --git a/builtins/util.m4 b/builtins/util.m4
index 6f36f71e..de48a0a1 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -187,124 +187,160 @@ define(`saturation_arithmetic',
         WIDTH, `16', `saturation_arithmetic_vec16()',
                      `saturation_arithmetic_uniform()')')
 
-;;uniform saturation arithmetic
+;; utility function used by saturation_arithmetic_uniform below. This shouldn't be called by
+;; target .ll files directly.
+;; $1: {add,sub} (used in constructing function names)
+;; Each generated function widens both operands to twice their bit width,
+;; applies $1, clamps the wide result to the range of the narrow type and
+;; truncates it back. In the unsigned variants a negative difference also
+;; passes the unsigned greater-than test; the signed less-than-zero select
+;; that follows replaces it with zero, so the order of the selects matters.
 
-define(`saturation_arithmetic_uniform', `
-declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
-define i8 @__padds_i8(i8 %a0, i8 %a1) {
-  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1)
+define(`saturation_arithmetic_uniform_universal', `
+declare <16 x i8> @llvm.x86.sse2.p$1s.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__p$1s_i8(i8 %a0, i8 %a1) {
+  %a0_i16 = sext i8 %a0 to i16
+  %a1_i16 = sext i8 %a1 to i16
+  %res = $1 i16 %a0_i16, %a1_i16
+  %over_mask = icmp sgt i16 %res, 127
+  %over_res = select i1 %over_mask, i16 127, i16 %res
+  %under_mask = icmp slt i16 %res, -128
+  %ret_i16 = select i1 %under_mask, i16 -128, i16 %over_res
+  %ret = trunc i16 %ret_i16 to i8
   ret i8 %ret
 }
 
-declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
-define i16 @__padds_i16(i16 %a0, i16 %a1) {
-  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
+declare <8 x i16> @llvm.x86.sse2.p$1s.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__p$1s_i16(i16 %a0, i16 %a1) {
+  %a0_i32 = sext i16 %a0 to i32
+  %a1_i32 = sext i16 %a1 to i32
+  %res = $1 i32 %a0_i32, %a1_i32
+  %over_mask = icmp sgt i32 %res, 32767
+  %over_res = select i1 %over_mask, i32 32767, i32 %res
+  %under_mask = icmp slt i32 %res, -32768
+  %ret_i32 = select i1 %under_mask, i32 -32768, i32 %over_res
+  %ret = trunc i32 %ret_i32 to i16
   ret i16 %ret
 }
 
-declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
-define i8 @__paddus_i8(i8 %a0, i8 %a1) {
-  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1)
+declare <16 x i8> @llvm.x86.sse2.p$1us.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__p$1us_i8(i8 %a0, i8 %a1) {
+  %a0_i16 = zext i8 %a0 to i16
+  %a1_i16 = zext i8 %a1 to i16
+  %res = $1 i16 %a0_i16, %a1_i16
+  %over_mask = icmp ugt i16 %res, 255
+  %over_res = select i1 %over_mask, i16 255, i16 %res
+  %under_mask = icmp slt i16 %res, 0
+  %ret_i16 = select i1 %under_mask, i16 0, i16 %over_res
+  %ret = trunc i16 %ret_i16 to i8
   ret i8 %ret
 }
 
-declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
-define i16 @__paddus_i16(i16 %a0, i16 %a1) {
-  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
-  ret i16 %ret
-}
-
-declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
-define i8 @__psubs_i8(i8 %a0, i8 %a1) {
-  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1)
-  ret i8 %ret
-}
-
-declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
-define i16 @__psubs_i16(i16 %a0, i16 %a1) {
-  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
-  ret i16 %ret
-}
-
-declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
-define i8 @__psubus_i8(i8 %a0, i8 %a1) {
-  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1)
-  ret i8 %ret
-}
-
-declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
-define i16 @__psubus_i16(i16 %a0, i16 %a1) {
-  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
+declare <8 x i16> @llvm.x86.sse2.p$1us.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__p$1us_i16(i16 %a0, i16 %a1) {
+  %a0_i32 = zext i16 %a0 to i32
+  %a1_i32 = zext i16 %a1 to i32
+  %res = $1 i32 %a0_i32, %a1_i32
+  %over_mask = icmp ugt i32 %res, 65535
+  %over_res = select i1 %over_mask, i32 65535, i32 %res
+  %under_mask = icmp slt i32 %res, 0
+  %ret_i32 = select i1 %under_mask, i32 0, i32 %over_res
+  %ret = trunc i32 %ret_i32 to i16
   ret i16 %ret
 }
 ')
 
-;;no vector saturation arithmetic
+;;uniform saturation arithmetic
+
+define(`saturation_arithmetic_uniform', `
+saturation_arithmetic_uniform_universal(sub)
+saturation_arithmetic_uniform_universal(add)
+')
+
+;; create vector constant. Used by saturation_arithmetic_novec_universal below.
+;; $1: element type, $2: element value. Yields a vector literal with one copy
+;; per element for widths 4, 8, 16, 32 and 64, and a one-element vector otherwise.
+
+define(`const_vector', `
+ifelse(WIDTH,  `4', `<$1 $2, $1 $2, $1 $2, $1 $2>',
+       WIDTH,  `8', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
+       WIDTH, `16', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
+       WIDTH, `32', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
+       WIDTH, `64', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
+                    `<$1 $2>')')
+
+;; utility function used by saturation_arithmetic_novec below. This shouldn't be called by
+;; target .ll files directly.
+;; $1: {add,sub} (used in constructing function names)
+;; Elementwise counterparts of the scalar helpers above: widen, apply $1, clamp
+;; against splat constants and truncate.
+
+define(`saturation_arithmetic_novec_universal', `
+define <WIDTH x i8> @__p$1s_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0_i16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
+  %v1_i16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
+  %res = $1 <WIDTH x i16> %v0_i16, %v1_i16
+  %over_mask = icmp sgt <WIDTH x i16> %res, const_vector(i16, 127)
+  %over_res = select <WIDTH x i1> %over_mask, <WIDTH x i16> const_vector(i16, 127), <WIDTH x i16> %res
+  %under_mask = icmp slt <WIDTH x i16> %res, const_vector(i16, -128)
+  %ret_i16 = select <WIDTH x i1> %under_mask, <WIDTH x i16> const_vector(i16, -128), <WIDTH x i16> %over_res
+  %ret = trunc <WIDTH x i16> %ret_i16 to <WIDTH x i8>
+  ret <WIDTH x i8> %ret
+}
+
+define <WIDTH x i16> @__p$1s_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0_i32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
+  %v1_i32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
+  %res = $1 <WIDTH x i32> %v0_i32, %v1_i32
+  %over_mask = icmp sgt <WIDTH x i32> %res, const_vector(i32, 32767)
+  %over_res = select <WIDTH x i1> %over_mask, <WIDTH x i32> const_vector(i32, 32767), <WIDTH x i32> %res
+  %under_mask = icmp slt <WIDTH x i32> %res, const_vector(i32, -32768)
+  %ret_i32 = select <WIDTH x i1> %under_mask, <WIDTH x i32> const_vector(i32, -32768), <WIDTH x i32> %over_res
+  %ret = trunc <WIDTH x i32> %ret_i32 to <WIDTH x i16>
+  ret <WIDTH x i16> %ret
+}
+
+define <WIDTH x i8> @__p$1us_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0_i16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
+  %v1_i16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
+  %res = $1 <WIDTH x i16> %v0_i16, %v1_i16
+  %over_mask = icmp ugt <WIDTH x i16> %res, const_vector(i16, 255)
+  %over_res = select <WIDTH x i1> %over_mask, <WIDTH x i16> const_vector(i16, 255), <WIDTH x i16> %res
+  %under_mask = icmp slt <WIDTH x i16> %res, const_vector(i16, 0)
+  %ret_i16 = select <WIDTH x i1> %under_mask, <WIDTH x i16> const_vector(i16, 0), <WIDTH x i16> %over_res
+  %ret = trunc <WIDTH x i16> %ret_i16 to <WIDTH x i8>
+  ret <WIDTH x i8> %ret
+}
+
+define <WIDTH x i16> @__p$1us_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0_i32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
+  %v1_i32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
+  %res = $1 <WIDTH x i32> %v0_i32, %v1_i32
+  %over_mask = icmp ugt <WIDTH x i32> %res, const_vector(i32, 65535)
+  %over_res = select <WIDTH x i1> %over_mask, <WIDTH x i32> const_vector(i32, 65535), <WIDTH x i32> %res
+  %under_mask = icmp slt <WIDTH x i32> %res, const_vector(i32, 0)
+  %ret_i32 = select <WIDTH x i1> %under_mask, <WIDTH x i32> const_vector(i32, 0), <WIDTH x i32> %over_res
+  %ret = trunc <WIDTH x i32> %ret_i32 to <WIDTH x i16>
+  ret <WIDTH x i16> %ret
+}
+')
+
+;; implementation for targets which don't have h/w instructions
 
 define(`saturation_arithmetic_novec', `
-define <WIDTH x i8> @__padds_vi8(<WIDTH x i8>, <WIDTH x i8>) {
-  convert1to16(i8, %0, %v0)
-  convert1to16(i8, %1, %v1)
-  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
-  convert16to1(i8, %r16, %r)
-  ret <WIDTH x i8> %r
-}
-
-define <WIDTH x i16> @__padds_vi16(<WIDTH x i16>, <WIDTH x i16>) {
-  convert1to8(i16, %0, %v0)
-  convert1to8(i16, %1, %v1)
-  %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1)
-  convert8to1(i16, %r16, %r)
-  ret <WIDTH x i16> %r
-}
-
-define <WIDTH x i8> @__paddus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
-  convert1to16(i8, %0, %v0)
-  convert1to16(i8, %1, %v1)
-  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
-  convert16to1(i8, %r16, %r)
-  ret <WIDTH x i8> %r
-}
-
-define <WIDTH x i16> @__paddus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
-  convert1to8(i16, %0, %v0)
-  convert1to8(i16, %1, %v1)
-  %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1)
-  convert8to1(i16, %r16, %r)
-  ret <WIDTH x i16> %r
-}
-
-define <WIDTH x i8> @__psubs_vi8(<WIDTH x i8>, <WIDTH x i8>) {
-  convert1to16(i8, %0, %v0)
-  convert1to16(i8, %1, %v1)
-  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
-  convert16to1(i8, %r16, %r)
-  ret <WIDTH x i8> %r
-}
-
-define <WIDTH x i16> @__psubs_vi16(<WIDTH x i16>, <WIDTH x i16>) {
-  convert1to8(i16, %0, %v0)
-  convert1to8(i16, %1, %v1)
-  %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1)
-  convert8to1(i16, %r16, %r)
-  ret <WIDTH x i16> %r
-}
-
-define <WIDTH x i8> @__psubus_vi8(<WIDTH x i8>, <WIDTH x i8>) {
-  convert1to16(i8, %0, %v0)
-  convert1to16(i8, %1, %v1)
-  %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1)
-  convert16to1(i8, %r16, %r)
-  ret <WIDTH x i8> %r
-}
-
-define <WIDTH x i16> @__psubus_vi16(<WIDTH x i16>, <WIDTH x i16>) {
-  convert1to8(i16, %0, %v0)
-  convert1to8(i16, %1, %v1)
-  %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1)
-  convert8to1(i16, %r16, %r)
-  ret <WIDTH x i16> %r
-}
+saturation_arithmetic_novec_universal(sub)
+saturation_arithmetic_novec_universal(add)
 ')
 
 ;;4-wide vector saturation arithmetic