From 19f73b2ede4fc2142299af13a7f24d34b6cf153a Mon Sep 17 00:00:00 2001
From: Vsevolod Livinskij
Date: Mon, 25 Nov 2013 19:16:02 +0400
Subject: [PATCH 01/16] uniform signed/unsigned int8/16

---
 builtins.cpp                   |  9 ++++++
 builtins/target-sse2-common.ll | 58 ++++++++++++++++++++++++++++++++++
 stdlib.ispc                    | 35 ++++++++++++++++++++
 3 files changed, 102 insertions(+)

diff --git a/builtins.cpp b/builtins.cpp
index 2afd92d9..c001318a 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -488,12 +488,21 @@ lSetInternalFunctions(llvm::Module *module) {
         "__num_cores",
         "__packed_load_active",
         "__packed_store_active",
+        "__padds_i8",
+        "__padds_i16",
+        "__vpadds_i8",
+        "__paddus_i8",
+        "__paddus_i16",
         "__popcnt_int32",
         "__popcnt_int64",
         "__prefetch_read_uniform_1",
         "__prefetch_read_uniform_2",
         "__prefetch_read_uniform_3",
         "__prefetch_read_uniform_nt",
+        "__psubs_i8",
+        "__psubs_i16",
+        "__psubus_i8",
+        "__psubus_i16",
         "__rcp_uniform_float",
         "__rcp_varying_float",
         "__rdrand_i16",
diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll
index ad1d88bc..070912ea 100644
--- a/builtins/target-sse2-common.ll
+++ b/builtins/target-sse2-common.ll
@@ -35,6 +35,64 @@
 define_shuffles()
 aossoa()
 rdrand_decls()
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; saturation arithmetic
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <16 x i8> @__vpadds_i8(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: vpaddsb
+  %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+
+;;declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__padds_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__padds_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
+  ret i16 %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__paddus_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__paddus_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
+  ret i16 %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__psubs_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__psubs_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
+  ret i16 %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__psubus_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__psubus_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
+  ret i16 %ret
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
diff --git a/stdlib.ispc b/stdlib.ispc
index 6768594b..464da5d4 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -4257,6 +4257,41 @@ static inline void
 fastmath() { __fastmath(); }
+///////////////////////////////////////////////////////////////////////////
+// saturation arithmetic
+
+static inline uniform int8 padds(uniform int8 a, uniform int8 b) {
+    return __padds_i8(a, b);
+}
+
+static inline uniform int16 padds(uniform int16 a, uniform int16 b) {
+    return __padds_i16(a, b);
+}
+
+static inline uniform unsigned int8 paddus(uniform unsigned int8 a, uniform unsigned int8 b) {
+    return __paddus_i8(a, b);
+}
+
+static inline uniform unsigned int16 paddus(uniform unsigned int16 a, unsigned uniform int16 b) {
+    return __paddus_i16(a, b);
+}
+
+static inline uniform int8 psubs(uniform int8 a, uniform int8 b) {
+    return __psubs_i8(a, b);
+}
+
+static inline uniform int16 psubs(uniform int16 a, uniform int16 b) {
+    return __psubs_i16(a, b);
+}
+
+static inline uniform unsigned int8 psubus(uniform unsigned int8 a, uniform unsigned int8 b) {
+    return __psubus_i8(a, b);
+}
+
+static inline uniform unsigned int16 psubus(uniform unsigned int16 a, unsigned uniform int16 b) {
+    return __psubus_i16(a, b);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // rdrand

From 35a4d1b3a27b88dce6f6c7e9e6eeef7bc5fbdf99 Mon Sep 17 00:00:00 2001
From: Vsevolod Livinskij
Date: Wed, 27 Nov 2013 00:55:57 +0400
Subject: [PATCH 02/16] Add some AVX2 intrinsics

---
 builtins.cpp                   |  9 +++-
 builtins/target-avx-common.ll  | 96 +++++++++++++++++++++++++++++++++
 builtins/target-sse2-common.ll | 55 ++++++++++++++++---
 builtins/target-sse4-common.ll | 98 ++++++++++++++++++++++++++++++++++
 stdlib.ispc                    | 47 ++++++++++++++--
 5 files changed, 292 insertions(+), 13 deletions(-)

diff --git a/builtins.cpp b/builtins.cpp
index c001318a..c6828a00 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -490,9 +490,12 @@ lSetInternalFunctions(llvm::Module *module) {
         "__packed_store_active",
         "__padds_i8",
         "__padds_i16",
-        "__vpadds_i8",
+        "__padds_vi8",
+        "__padds_vi16",
         "__paddus_i8",
         "__paddus_i16",
+        "__paddus_vi8",
+        "__paddus_vi16",
         "__popcnt_int32",
         "__popcnt_int64",
         "__prefetch_read_uniform_1",
@@ -501,8 +504,12 @@ lSetInternalFunctions(llvm::Module *module) {
         "__prefetch_read_uniform_nt",
         "__psubs_i8",
         "__psubs_i16",
+        "__psubs_vi8",
+        "__psubs_vi16",
         "__psubus_i8",
         "__psubus_i16",
+        "__psubus_vi8",
+        "__psubus_vi16",
         "__rcp_uniform_float",
         "__rcp_varying_float",
         "__rdrand_i16",
diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll
index 1c467476..d47145f2 100644
--- a/builtins/target-avx-common.ll
+++ b/builtins/target-avx-common.ll
@@ -41,7 +41,103 @@
 define_prefetches()
 define_shuffles()
 aossoa()
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; saturation arithmetic
+declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
+define i8 @__padds_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.padds.b, %a0, %a1)
+  ret i8 %ret
+}
+
+define <32 x i8> @__padds_vi8(<32 x i8> %a0, <32 x i8> %a1) {
+  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+
+
+declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
+define i16 @__padds_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.padds.w, %a0, %a1)
+  ret i16 %ret
+}
+
+define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) {
+  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+
+
+declare <32 x i8>
@llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone +define i8 @__paddus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.paddus.b, %a0, %a1) + ret i8 %ret +} + +define <32 x i8> @__paddus_vi8(<32 x i8> %a0, <32 x i8> %a1) { + %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} + + +declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone +define i16 @__paddus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.paddus.w, %a0, %a1) + ret i16 %ret +} + +define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) { + %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} + + +declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone +define i8 @__psubs_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubs.b, %a0, %a1) + ret i8 %ret +} + +define <32 x i8> @__psubs_vi8(<32 x i8> %a0, <32 x i8> %a1) { + %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} + + +declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone +define i16 @__psubs_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubs.w, %a0, %a1) + ret i16 %ret +} + +define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) { + %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} + + +declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone +define i8 @__psubus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubus.b, %a0, %a1) + ret i8 %ret +} + +define <32 x i8> @__psubus_vi8(<32 x i8> %a0, <32 x i8> %a1) { + %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} + + +declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone +define i16 @__psubus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubus.w, %a0, %a1) + ret i16 %ret +} + +define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) { + %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index 070912ea..a1e6f915 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -37,62 +37,101 @@ rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; saturation arithmetic + declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone -define <16 x i8> @__vpadds_i8(<16 x i8> %a0, <16 x i8> %a1) { - ; CHECK: vpaddsb - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - -;;declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__padds_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) ret i8 %ret } +define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res 
+} + + declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__padds_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) ret i16 %ret } +define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__paddus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) ret i8 %ret } +define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__paddus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) ret i16 %ret } +define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubs_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) ret i8 %ret } +define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubs_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) ret i16 %ret } +define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) ret i8 %ret } +define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) ret i16 %ret } +define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 50dd0582..70acca63 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -38,6 +38,104 @@ define_shuffles() aossoa() rdrand_decls() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; saturation arithmetic + +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__padds_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) + ret i8 %ret +} + +define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> 
%a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__padds_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret i16 %ret +} + +define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__paddus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) + ret i8 %ret +} + +define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__paddus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret i16 %ret +} + +define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubs_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) + ret i8 %ret +} + +define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubs_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret i16 %ret +} + +define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) + ret i8 %ret +} + +define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret i16 %ret +} + +define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/stdlib.ispc b/stdlib.ispc index 464da5d4..5b3d144c 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4264,34 +4264,73 @@ static inline uniform int8 padds(uniform int8 a, uniform int8 b) { return __padds_i8(a, b); } +static inline varying int8 padds(varying int8 a, varying int8 b) { + return __padds_vi8(a, b); +} + static inline uniform int16 padds(uniform int16 a, uniform int16 b) { return __padds_i16(a, b); } -static inline uniform unsigned int8 paddus(uniform unsigned int8 a, uniform unsigned int8 b) { +static inline 
varying int16 padds(varying int16 a, varying int16 b) { + return __padds_vi16(a, b); +} + +static inline uniform unsigned int8 paddus(uniform unsigned int8 a, + uniform unsigned int8 b) { return __paddus_i8(a, b); } -static inline uniform unsigned int16 paddus(uniform unsigned int16 a, unsigned uniform int16 b) { +static inline varying unsigned int8 paddus(varying unsigned int8 a, + varying unsigned int8 b) { + return __paddus_vi8(a, b); +} + +static inline uniform unsigned int16 paddus(uniform unsigned int16 a, + unsigned uniform int16 b) { return __paddus_i16(a, b); } +static inline varying unsigned int16 paddus(varying unsigned int16 a, + unsigned varying int16 b) { + return __paddus_vi16(a, b); +} + static inline uniform int8 psubs(uniform int8 a, uniform int8 b) { return __psubs_i8(a, b); } +static inline varying int8 psubs(varying int8 a, varying int8 b) { + return __psubs_vi8(a, b); +} + static inline uniform int16 psubs(uniform int16 a, uniform int16 b) { return __psubs_i16(a, b); } -static inline uniform unsigned int8 psubus(uniform unsigned int8 a, uniform unsigned int8 b) { +static inline varying int16 psubs(varying int16 a, varying int16 b) { + return __psubs_vi16(a, b); +} + +static inline uniform unsigned int8 psubus(uniform unsigned int8 a, + uniform unsigned int8 b) { return __psubus_i8(a, b); } -static inline uniform unsigned int16 psubus(uniform unsigned int16 a, unsigned uniform int16 b) { +static inline varying unsigned int8 psubus(varying unsigned int8 a, + varying unsigned int8 b) { + return __psubus_vi8(a, b); +} + +static inline uniform unsigned int16 psubus(uniform unsigned int16 a, + unsigned uniform int16 b) { return __psubus_i16(a, b); } +static inline varying unsigned int16 psubus(varying unsigned int16 a, + unsigned varying int16 b) { + return __psubus_vi16(a, b); +} /////////////////////////////////////////////////////////////////////////// // rdrand From 42c148bf75bf8efc498370c51a90e71010bdb725 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Fri, 29 Nov 2013 03:33:40 +0400 Subject: [PATCH 03/16] Changes for sse2 and sse4 in saturation --- builtins/target-avx-common.ll | 97 -------------------------- builtins/target-sse2-common.ll | 48 +------------ builtins/target-sse2-x2.ll | 82 ++++++++++++++++++++++ builtins/target-sse2.ll | 122 +++++++++++++++++++++++++++++++++ builtins/target-sse4-16.ll | 82 ++++++++++++++++++++++ builtins/target-sse4-8.ll | 43 ++++++++++++ builtins/target-sse4-common.ll | 49 +------------ builtins/target-sse4-x2.ll | 82 ++++++++++++++++++++++ builtins/target-sse4.ll | 122 +++++++++++++++++++++++++++++++++ 9 files changed, 535 insertions(+), 192 deletions(-) diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index d47145f2..dcca74f0 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -41,103 +41,6 @@ define_prefetches() define_shuffles() aossoa() -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; saturation arithmetic - -declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.padds.b, %a0, %a1) - ret i8 %ret -} - -define <32 x i8> @__padds_vi8(<32 x i8> %a0, <32 x i8> %a1) { - %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} - - -declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - 
sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.padds.w, %a0, %a1) - ret i16 %ret -} - -define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) { - %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} - - -declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.paddus.b, %a0, %a1) - ret i8 %ret -} - -define <32 x i8> @__paddus_vi8(<32 x i8> %a0, <32 x i8> %a1) { - %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} - - -declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.paddus.w, %a0, %a1) - ret i16 %ret -} - -define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) { - %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} - - -declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubs.b, %a0, %a1) - ret i8 %ret -} - -define <32 x i8> @__psubs_vi8(<32 x i8> %a0, <32 x i8> %a1) { - %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} - - -declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubs.w, %a0, %a1) - ret i16 %ret -} - -define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) { - %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} - - -declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone -define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubus.b, %a0, %a1) - ret i8 %ret -} - -define <32 x i8> @__psubus_vi8(<32 x i8> %a0, <32 x i8> %a1) { - %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} - - -declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubus.w, %a0, %a1) - ret i16 %ret -} - -define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) { - %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index a1e6f915..a1fec300 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -36,7 +36,7 @@ aossoa() rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; saturation arithmetic +;;scalar saturation arithmetic declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__padds_i8(i8 %a0, i8 %a1) { @@ -44,94 +44,48 @@ define i8 @__padds_i8(i8 %a0, i8 %a1) { ret i8 %ret } -define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; 
<<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__padds_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__paddus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__paddus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubs_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubs_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 77bf1a9d..0f3eb275 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -47,6 +47,88 @@ int64minmax() include(`target-sse2-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x 
i32> + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index e42d4990..1409e31d 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -44,6 +44,128 @@ int64minmax() include(`target-sse2-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__padds_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> 
@llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 72b81ff0..0ba62ac9 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -44,6 +44,88 @@ int64minmax() include(`target-sse4-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 69b355e3..6f00aa83 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -44,6 +44,49 @@ int64minmax() include(`target-sse4-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic + +define @__padds_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__padds_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret %ret +} + +define 
@__paddus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__paddus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret %ret +} + +define @__psubs_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubs_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret %ret +} + +define @__psubus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 70acca63..e33dbf01 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -39,7 +39,7 @@ aossoa() rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; saturation arithmetic +;;scalar saturation arithmetic declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__padds_i8(i8 %a0, i8 %a1) { @@ -47,95 +47,48 @@ define i8 @__padds_i8(i8 %a0, i8 %a1) { ret i8 %ret } -define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__padds_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__paddus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__paddus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubs_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubs_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - - declare <16 x i8> 
@llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) ret i8 %ret } -define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { - %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) ret i16 %ret } -define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { - %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 842db53f..5c330e51 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -47,6 +47,88 @@ int64minmax() include(`target-sse4-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 16177b47..0478ab2c 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -44,6 +44,128 @@ int64minmax() include(`target-sse4-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = 
shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__padds_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines From bec66623383e44deef6ef80209a97f33588c65b1 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Fri, 29 Nov 2013 03:45:25 +0400 Subject: [PATCH 04/16] Some cganges for avx1 and avx1.1 in saturation --- builtins/target-avx-common.ll | 51 +++++++++++++ builtins/target-avx-x2.ll | 43 +++++++++++ builtins/target-avx.ll | 82 ++++++++++++++++++++ builtins/target-avx1-i64x4base.ll | 122 ++++++++++++++++++++++++++++++ 4 files changed, 298 insertions(+) diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index dcca74f0..d5eac54f 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -41,6 +41,57 @@ define_prefetches() define_shuffles() aossoa() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;scalar saturation arithmetic + +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__padds_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__padds_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 
8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__paddus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__paddus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubs_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubs_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret i16 %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index f8fd5cd5..694afe35 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -43,6 +43,49 @@ int64minmax() include(`target-avx-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic + +define @__padds_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__padds_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret %ret +} + +define @__paddus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__paddus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret %ret +} + +define @__psubs_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubs_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret %ret +} + +define @__psubus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret %ret +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e98a3843..a5a497d0 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -43,6 +43,88 @@ int64minmax() include(`target-avx-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) 
{ + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index e1832030..831ae0e5 100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -43,6 +43,128 @@ int64minmax() include(`target-avx-common.ll') +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;vector saturation arithmetic +define @__padds_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__padds_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__paddus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubs_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + 
+define @__psubus_vi8(, ) { + %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, + <16 x i32> + %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <4 x i32> + ret %r +} + +define @__psubus_vi16(, ) { + %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, + <8 x i32> + %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, + <8 x i32> + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + %r = shufflevector <8 x i16> %r16, <8 x i16> undef, + <4 x i32> + ret %r +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp From 4c330bc38bdfaace8ac8bfcac7cea92b1ca2ebdb Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Fri, 29 Nov 2013 18:40:04 +0400 Subject: [PATCH 05/16] Add code generation of saturation --- builtins/target-avx.ll | 53 ++++----------- builtins/target-avx1-i64x4base.ll | 105 +++++++----------------------- builtins/target-sse2-x2.ll | 53 ++++----------- builtins/target-sse2.ll | 105 +++++++----------------------- builtins/target-sse4-16.ll | 53 ++++----------- builtins/target-sse4-x2.ll | 53 ++++----------- builtins/target-sse4.ll | 105 +++++++----------------------- builtins/util.m4 | 52 +++++++++++++++ 8 files changed, 179 insertions(+), 400 deletions(-) diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index a5a497d0..c56ec67d 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -45,18 +45,12 @@ include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -66,17 +60,10 @@ define @__padds_vi16( %a0, %a1) { } define @__paddus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -86,17 +73,10 @@ define @__paddus_vi16( %a0, %a1) { } define @__psubs_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -106,17 +86,10 @@ define @__psubs_vi16( %a0, %a1) { } define @__psubus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index 831ae0e5..de26a29e 100644 --- 
a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -45,123 +45,68 @@ include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__padds_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__paddus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__paddus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubs_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubs_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 0f3eb275..d59513b3 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -49,18 +49,12 
@@ include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -70,17 +64,10 @@ define @__padds_vi16( %a0, %a1) { } define @__paddus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -90,17 +77,10 @@ define @__paddus_vi16( %a0, %a1) { } define @__psubs_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -110,17 +90,10 @@ define @__psubs_vi16( %a0, %a1) { } define @__psubus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 1409e31d..11c51f70 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -46,123 +46,68 @@ include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__padds_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__paddus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__paddus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + 
convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubs_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubs_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 0ba62ac9..156cccab 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -46,18 +46,12 @@ include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -67,17 +61,10 @@ define @__padds_vi16( %a0, %a1) { } define @__paddus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -87,17 +74,10 @@ define @__paddus_vi16( %a0, %a1) { } define @__psubs_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -107,17 +87,10 @@ define @__psubs_vi16( %a0, %a1) { } define @__psubus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector 
<8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 5c330e51..1f4f8332 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -49,18 +49,12 @@ include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -70,17 +64,10 @@ define @__padds_vi16( %a0, %a1) { } define @__paddus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -90,17 +77,10 @@ define @__paddus_vi16( %a0, %a1) { } define @__psubs_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } @@ -110,17 +90,10 @@ define @__psubs_vi16( %a0, %a1) { } define @__psubus_vi8(, ) { - %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, - <16 x i32> - %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, - <16 x i32> + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <8 x i32> + convert16to8(i8, %r16, %r) ret %r } diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 0478ab2c..2f6ebf6a 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -46,123 +46,68 @@ include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;vector saturation arithmetic + define @__padds_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__padds_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__paddus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x 
i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__paddus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubs_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubs_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } define @__psubus_vi8(, ) { - %v0 = shufflevector <4 x i8> %0, <4 x i8> undef, - <16 x i32> - %v1 = shufflevector <4 x i8> %1, <4 x i8> undef, - <16 x i32> + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - %r = shufflevector <16 x i8> %r16, <16 x i8> undef, - <4 x i32> + convert16to4(i8, %r16, %r) ret %r } define @__psubus_vi16(, ) { - %v0 = shufflevector <4 x i16> %0, <4 x i16> undef, - <8 x i32> - %v1 = shufflevector <4 x i16> %1, <4 x i16> undef, - <8 x i32> + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - %r = shufflevector <8 x i16> %r16, <8 x i16> undef, - <4 x i32> + convert8to4(i16, %r16, %r) ret %r } diff --git a/builtins/util.m4 b/builtins/util.m4 index e1c9bf97..5f75d23a 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,58 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector convertation utilities +;; convert 4-wide vector into 8-wide vector +;; +;; $1: vector element type +;; $2: 4-wide vector +;; $3: 8-wide vector + +define(`convert4to8', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, + <8 x i32> +') + +define(`convert4to16', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, + <16 x i32> +') + +define(`convert8to16', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <16 x i32> +') + +;; convert 4-wide vector into 8-wide vector +;; +;; $1: vector element type +;; $2: 8-wide vector +;; $3: 4-wide vector + +define(`convert8to4', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + + +define(`convert16to4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <4 x i32> +') + +define(`convert16to8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; vector deconstruction utilities ;; split 8-wide vector into 2 4-wide vectors ;; From 4faff1a63cbe9970f895d96518583897f2f8eb66 Mon Sep 17 
00:00:00 2001 From: Vsevolod Livinskij Date: Sat, 30 Nov 2013 10:48:18 +0400 Subject: [PATCH 06/16] structural change --- builtins/target-avx-common.ll | 52 +----- builtins/target-avx-x2.ll | 44 +---- builtins/target-avx.ll | 56 +----- builtins/target-avx1-i64x4base.ll | 68 +------- builtins/target-sse2-common.ll | 52 +----- builtins/target-sse2-x2.ll | 56 +----- builtins/target-sse2.ll | 68 +------- builtins/target-sse4-16.ll | 56 +----- builtins/target-sse4-8.ll | 44 +---- builtins/target-sse4-common.ll | 52 +----- builtins/target-sse4-x2.ll | 56 +----- builtins/target-sse4.ll | 68 +------- builtins/util.m4 | 273 ++++++++++++++++++++++++++++++ 13 files changed, 285 insertions(+), 660 deletions(-) diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index d5eac54f..32157a77 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -40,57 +40,7 @@ ctlztz() define_prefetches() define_shuffles() aossoa() - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;scalar saturation arithmetic - -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret i16 %ret -} +saturation_arithmetic_scalar() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 694afe35..cde63e7b 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -40,52 +40,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec16() include(`target-avx-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__padds_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret %ret -} - 
-define @__paddus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__paddus_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret %ret -} - -define @__psubs_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__psubs_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret %ret -} - -define @__psubus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__psubus_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index c56ec67d..8f20bfed 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -40,64 +40,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec8() include(`target-avx-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res -} - -define @__paddus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res -} - -define @__psubs_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res -} - -define @__psubus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) - ret %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index de26a29e..a2d292f2 100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -40,76 +40,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec4() include(`target-avx-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__padds_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__paddus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define 
@__paddus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubs_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index a1fec300..b5c5559c 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -34,57 +34,7 @@ define_prefetches() define_shuffles() aossoa() rdrand_decls() - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;scalar saturation arithmetic - -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret i16 %ret -} +saturation_arithmetic_scalar() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index d59513b3..b4b52d91 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -44,64 +44,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() 
+saturation_arithmetic_vec8() include(`target-sse2-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res -} - -define @__paddus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res -} - -define @__psubs_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res -} - -define @__psubus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) - ret %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 11c51f70..bdf6f848 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -41,76 +41,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec4() include(`target-sse2-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__padds_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__paddus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubs_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - 
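;; For reference, a hand-expanded sketch (assumed, not taken from the patch) of what
;; the convert8to16()/convert16to8() helpers used above turn into: each is a single
;; shufflevector that widens the operand with undef lanes and then selects the low
;; half of the result back out, roughly:
;;   %v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
;;         <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
;;                     i32 undef, i32 undef, i32 undef, i32 undef,
;;                     i32 undef, i32 undef, i32 undef, i32 undef>
;;   %r  = shufflevector <16 x i8> %r16, <16 x i8> undef,
;;         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;; The high lanes are undef, which is harmless because they are discarded after the
;; saturating intrinsic runs.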
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 156cccab..1c0b045a 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -41,64 +41,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec8() include(`target-sse4-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res -} - -define @__paddus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res -} - -define @__psubs_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res -} - -define @__psubus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) - ret %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 6f00aa83..49351856 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -41,52 +41,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec16() include(`target-sse4-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__padds_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret %ret -} - -define @__paddus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__paddus_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret %ret -} - -define @__psubs_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__psubs_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret %ret -} - -define @__psubus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res -} - -define @__psubus_vi16( %a0, %a1) { - binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index e33dbf01..8eeaa413 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -37,57 +37,7 @@ define_prefetches() define_shuffles() aossoa() 
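;; For reference, a hand-expanded sketch (assumed, not taken from the patch) of the
;; sse_binary_scalar() helper used by the scalar saturating ops below: the scalar
;; operands are moved into lane 0 of an SSE-sized vector, the packed intrinsic is
;; applied, and lane 0 of the result is extracted, e.g. for __padds_i8 roughly:
;;   %v0  = insertelement <16 x i8> undef, i8 %a0, i32 0
;;   %v1  = insertelement <16 x i8> undef, i8 %a1, i32 0
;;   %vr  = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
;;   %ret = extractelement <16 x i8> %vr, i32 0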
rdrand_decls() - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;scalar saturation arithmetic - -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret i16 %ret -} +saturation_arithmetic_scalar() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 1f4f8332..2cd0ea4d 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -44,64 +44,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec8() include(`target-sse4-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res -} - -define @__paddus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res -} - -define @__psubs_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res -} - -define @__psubus_vi8(, ) { - convert8to16(i8, %0, %v0) - convert8to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to8(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16( %a0, %a1) { - %res = call 
@llvm.x86.sse2.psubus.w( %a0, %a1) - ret %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 2f6ebf6a..96effe39 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -41,76 +41,10 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic_vec4() include(`target-sse4-common.ll') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;vector saturation arithmetic - -define @__padds_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__padds_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__paddus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubs_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - -define @__psubus_vi8(, ) { - convert4to16(i8, %0, %v0) - convert4to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to4(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16(, ) { - convert4to8(i16, %0, %v0) - convert4to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to4(i16, %r16, %r) - ret %r -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines diff --git a/builtins/util.m4 b/builtins/util.m4 index 5f75d23a..0d5ed2de 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -77,6 +77,42 @@ define(`convert8to16', ` i32 undef, i32 undef, i32 undef, i32 undef> ') +define(`convert4to32', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, + <32 x i32> +') + +define(`convert8to32', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, + <32 x i32> +') + +define(`convert16to32', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, + <32 x i32> +') + ;; convert 4-wide vector into 8-wide vector ;; ;; $1: vector element type @@ -99,6 +135,243 @@ define(`convert16to8', ` <8 x i32> ') +define(`convert32to4', ` + $3 = shufflevector <32 x $1> $2, <32 x $1> undef, + <4 x i32> +') + +define(`convert32to8', ` + $3 = shufflevector <32 x $1> $2, <32 x $1> undef, + <8 x i32> +') + +define(`convert32to16', ` + $3 = shufflevector <32 x $1> $2, <32 x $1> undef, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;saturation arithmetic +;;scalar saturation arithmetic + +define(`saturation_arithmetic_scalar', ` +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__padds_i8(i8 %a0, i8 %a1) { + 
sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__padds_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__paddus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__paddus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubs_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubs_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret i16 %ret +} + +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) + ret i8 %ret +} + +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret i16 %ret +} +') + +;;4-wide vector saturation arithmetic + +define(`saturation_arithmetic_vec4', ` +define @__padds_vi8(, ) { + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to4(i8, %r16, %r) + ret %r +} + +define @__padds_vi16(, ) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret %r +} + +define @__paddus_vi8(, ) { + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to4(i8, %r16, %r) + ret %r +} + +define @__paddus_vi16(, ) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret %r +} + +define @__psubs_vi8(, ) { + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to4(i8, %r16, %r) + ret %r +} + +define @__psubs_vi16(, ) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret %r +} + +define @__psubus_vi8(, ) { + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to4(i8, %r16, %r) + ret %r +} + +define @__psubus_vi16(, ) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret %r +} +') + +;;8-wide vector saturation arithmetic + +define(`saturation_arithmetic_vec8', ` +define @__padds_vi8(, ) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + 
convert16to8(i8, %r16, %r) + ret %r +} + +define @__padds_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.w( %a0, %a1) + ret %res +} + +define @__paddus_vi8(, ) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret %r +} + +define @__paddus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) + ret %res +} + +define @__psubs_vi8(, ) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret %r +} + +define @__psubs_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) + ret %res +} + +define @__psubus_vi8(, ) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret %r +} + +define @__psubus_vi16( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) + ret %res +} +') + +;;16-wide vector saturation arithmetic + +define(`saturation_arithmetic_vec16', ` +define @__padds_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__padds_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret %ret +} + +define @__paddus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__paddus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret %ret +} + +define @__psubs_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubs_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret %ret +} + +define @__psubus_vi8( %a0, %a1) { + %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] + ret %res +} + +define @__psubus_vi16( %a0, %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret %ret +} +') + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector deconstruction utilities From 65768c20aec633b7c9f33b8c150169aeaca82c49 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Thu, 5 Dec 2013 00:34:14 +0400 Subject: [PATCH 07/16] Added tests for saturation and some fixes for generic and avx target --- builtins/target-avx.ll | 1 - builtins/target-avx1.ll | 1 + builtins/target-avx11.ll | 1 + builtins/target-avx2.ll | 1 + builtins/target-generic-1.ll | 2 + builtins/target-generic-16.ll | 2 +- builtins/target-generic-4.ll | 2 +- builtins/target-generic-8.ll | 2 +- builtins/target-generic-common.ll | 1 + builtins/util.m4 | 102 +++++++++++++++++++++++++++++- stdlib.ispc | 8 +-- tests/padds_i16.ispc | 11 ++++ tests/padds_i8.ispc | 11 ++++ tests/padds_vi16.ispc | 11 ++++ tests/padds_vi8.ispc | 11 ++++ tests/paddus_i16.ispc | 11 ++++ tests/paddus_i8.ispc | 11 ++++ tests/paddus_vi16.ispc | 11 ++++ tests/paddus_vi8.ispc | 11 ++++ tests/psubs_i16.ispc | 11 ++++ tests/psubs_i8.ispc | 11 ++++ tests/psubs_vi16.ispc | 11 ++++ tests/psubs_vi8.ispc | 11 ++++ tests/psubus_i16.ispc | 11 ++++ tests/psubus_i8.ispc | 11 ++++ tests/psubus_vi16.ispc | 11 ++++ tests/psubus_vi8.ispc | 11 ++++ 27 files changed, 288 insertions(+), 11 deletions(-) create mode 100644 tests/padds_i16.ispc create mode 100644 tests/padds_i8.ispc create mode 100644 tests/padds_vi16.ispc create mode 100644 tests/padds_vi8.ispc create mode 100644 tests/paddus_i16.ispc 
create mode 100644 tests/paddus_i8.ispc create mode 100644 tests/paddus_vi16.ispc create mode 100644 tests/paddus_vi8.ispc create mode 100644 tests/psubs_i16.ispc create mode 100644 tests/psubs_i8.ispc create mode 100644 tests/psubs_vi16.ispc create mode 100644 tests/psubs_vi8.ispc create mode 100644 tests/psubus_i16.ispc create mode 100644 tests/psubus_i8.ispc create mode 100644 tests/psubus_vi16.ispc create mode 100644 tests/psubus_vi8.ispc diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 8f20bfed..e98a3843 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -40,7 +40,6 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec8() include(`target-avx-common.ll') diff --git a/builtins/target-avx1.ll b/builtins/target-avx1.ll index 9c86cab8..f0cf1efb 100644 --- a/builtins/target-avx1.ll +++ b/builtins/target-avx1.ll @@ -32,6 +32,7 @@ include(`target-avx.ll') rdrand_decls() +saturation_arithmetic_vec8() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index fea0a7c2..706314a5 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -34,6 +34,7 @@ include(`target-avx.ll') ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', `rdrand_definition()') +saturation_arithmetic_vec8() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index f4a0ee07..c5f8e84f 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -38,6 +38,7 @@ include(`target-avx.ll') ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', `rdrand_definition()') +saturation_arithmetic_vec8() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 910565dd..bb974932 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -9,6 +9,8 @@ packed_load_and_store() scans() int64minmax() aossoa() +saturation_arithmetic_scalar() +saturation_arithmetic_novec() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store diff --git a/builtins/target-generic-16.ll b/builtins/target-generic-16.ll index 807fd242..36a2ee4c 100644 --- a/builtins/target-generic-16.ll +++ b/builtins/target-generic-16.ll @@ -31,4 +31,4 @@ define(`WIDTH',`16') include(`target-generic-common.ll') - +saturation_arithmetic_vec16() diff --git a/builtins/target-generic-4.ll b/builtins/target-generic-4.ll index 7eb1f300..a7e8dcaa 100644 --- a/builtins/target-generic-4.ll +++ b/builtins/target-generic-4.ll @@ -31,4 +31,4 @@ define(`WIDTH',`4') include(`target-generic-common.ll') - +saturation_arithmetic_vec4() diff --git a/builtins/target-generic-8.ll b/builtins/target-generic-8.ll index bd9261ff..b692322e 100644 --- a/builtins/target-generic-8.ll +++ b/builtins/target-generic-8.ll @@ -31,4 +31,4 @@ define(`WIDTH',`8') include(`target-generic-common.ll') - +saturation_arithmetic_vec8() diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 92b7a18e..c4d3b950 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -41,6 +41,7 @@ stdlib_core() scans() reduce_equal(WIDTH) rdrand_decls() +saturation_arithmetic_scalar() 
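;; As a reminder of the intended semantics (informal, not part of the generated
;; code): the saturating forms clamp instead of wrapping, i.e. for unsigned 8-bit
;; operands
;;   paddus(a, b) = min(a + b, 255)     psubus(a, b) = max(a - b, 0)
;; and the signed forms clamp to [-128, 127] for int8 and [-32768, 32767] for
;; int16. The new tests below exercise exactly these boundary values.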
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; broadcast/rotate/shuffle diff --git a/builtins/util.m4 b/builtins/util.m4 index 0d5ed2de..e0f7aaec 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -50,12 +50,28 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector convertation utilities -;; convert 4-wide vector into 8-wide vector +;; convert 1-wide vector into 8-wide vector ;; ;; $1: vector element type -;; $2: 4-wide vector +;; $2: 1-wide vector ;; $3: 8-wide vector + +define(`convert1to8', ` + $3 = shufflevector <1 x $1> $2, <1 x $1> undef, + <8 x i32> +') + + +define(`convert1to16', ` + $3 = shufflevector <1 x $1> $2, <1 x $1> undef, + <16 x i32> +') + define(`convert4to8', ` $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <8 x i32> $2, <8 x $1> undef, + <1 x i32> +') + + +define(`convert16to1', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <1 x i32> +') define(`convert8to4', ` $3 = shufflevector <8 x $1> $2, <8 x $1> undef, @@ -204,6 +232,74 @@ define i16 @__psubus_i16(i16 %a0, i16 %a1) { } ') +;;no vector saturation arithmetic + +define(`saturation_arithmetic_novec', ` +define @__padds_vi8(, ) { + convert1to16(i8, %0, %v0) + convert1to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to1(i8, %r16, %r) + ret %r +} + +define @__padds_vi16(, ) { + convert1to8(i16, %0, %v0) + convert1to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to1(i16, %r16, %r) + ret %r +} + +define @__paddus_vi8(, ) { + convert1to16(i8, %0, %v0) + convert1to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to1(i8, %r16, %r) + ret %r +} + +define @__paddus_vi16(, ) { + convert1to8(i16, %0, %v0) + convert1to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to1(i16, %r16, %r) + ret %r +} + +define @__psubs_vi8(, ) { + convert1to16(i8, %0, %v0) + convert1to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to1(i8, %r16, %r) + ret %r +} + +define @__psubs_vi16(, ) { + convert1to8(i16, %0, %v0) + convert1to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to1(i16, %r16, %r) + ret %r +} + +define @__psubus_vi8(, ) { + convert1to16(i8, %0, %v0) + convert1to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to1(i8, %r16, %r) + ret %r +} + +define @__psubus_vi16(, ) { + convert1to8(i16, %0, %v0) + convert1to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to1(i16, %r16, %r) + ret %r +} +') + ;;4-wide vector saturation arithmetic define(`saturation_arithmetic_vec4', ` diff --git a/stdlib.ispc b/stdlib.ispc index 5b3d144c..9e296687 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4287,12 +4287,12 @@ static inline varying unsigned int8 paddus(varying unsigned int8 a, } static inline uniform unsigned int16 paddus(uniform unsigned int16 a, - unsigned uniform int16 b) { + uniform unsigned int16 b) { return __paddus_i16(a, b); } static inline varying unsigned int16 paddus(varying unsigned int16 a, - unsigned varying int16 b) { + varying unsigned int16 b) { return __paddus_vi16(a, b); } @@ -4323,12 +4323,12 @@ static inline varying unsigned int8 psubus(varying unsigned int8 a, } static inline uniform 
unsigned int16 psubus(uniform unsigned int16 a, - unsigned uniform int16 b) { + uniform unsigned int16 b) { return __psubus_i16(a, b); } static inline varying unsigned int16 psubus(varying unsigned int16 a, - unsigned varying int16 b) { + varying unsigned int16 b) { return __psubus_vi16(a, b); } /////////////////////////////////////////////////////////////////////////// diff --git a/tests/padds_i16.ispc b/tests/padds_i16.ispc new file mode 100644 index 00000000..4668071b --- /dev/null +++ b/tests/padds_i16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a = 32767, b = 32767; // max signed int16 + RET[programIndex] = padds(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 32767; +} diff --git a/tests/padds_i8.ispc b/tests/padds_i8.ispc new file mode 100644 index 00000000..81da8a21 --- /dev/null +++ b/tests/padds_i8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a = 127, b = 127; // max signed int8 + RET[programIndex] = padds(a1, b1); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 127; +} diff --git a/tests/padds_vi16.ispc b/tests/padds_vi16.ispc new file mode 100644 index 00000000..7c6848e7 --- /dev/null +++ b/tests/padds_vi16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int16 a = 32767, b = 32767; // max signed int16 + RET[programIndex] = padds(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 32767; +} diff --git a/tests/padds_vi8.ispc b/tests/padds_vi8.ispc new file mode 100644 index 00000000..5d6196be --- /dev/null +++ b/tests/padds_vi8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int8 a = 127, b = 127; // max signed int8 + RET[programIndex] = padds(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 127; +} diff --git a/tests/paddus_i16.ispc b/tests/paddus_i16.ispc new file mode 100644 index 00000000..d2939677 --- /dev/null +++ b/tests/paddus_i16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int16 a = 65535, b = 65535; // max unsigned int16 + RET[programIndex] = paddus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 65535; +} diff --git a/tests/paddus_i8.ispc b/tests/paddus_i8.ispc new file mode 100644 index 00000000..23de8c21 --- /dev/null +++ b/tests/paddus_i8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a = 255, b = 255; // max unsigned int8 + RET[programIndex] = paddus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 255; +} diff --git a/tests/paddus_vi16.ispc b/tests/paddus_vi16.ispc new file mode 100644 index 00000000..803259f5 --- /dev/null +++ b/tests/paddus_vi16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int16 a = 65535, b = 65535; // max unsigned int16 + RET[programIndex] = paddus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 65535; +} diff --git 
a/tests/paddus_vi8.ispc b/tests/paddus_vi8.ispc new file mode 100644 index 00000000..3d7d3509 --- /dev/null +++ b/tests/paddus_vi8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int8 a = 255, b = 255; // max unsigned int8 + RET[programIndex] = paddus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 255; +} diff --git a/tests/psubs_i16.ispc b/tests/psubs_i16.ispc new file mode 100644 index 00000000..9038215e --- /dev/null +++ b/tests/psubs_i16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a = -32768, b = 32767; // min and max signed int16 + RET[programIndex] = psubs(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = -32768; +} diff --git a/tests/psubs_i8.ispc b/tests/psubs_i8.ispc new file mode 100644 index 00000000..1a661520 --- /dev/null +++ b/tests/psubs_i8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a = -128, b = 127; // min and max signed int8 + RET[programIndex] = psubs(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = -128; +} diff --git a/tests/psubs_vi16.ispc b/tests/psubs_vi16.ispc new file mode 100644 index 00000000..b1e2cf48 --- /dev/null +++ b/tests/psubs_vi16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int16 a = -32768, b = 32767; // min and max unsigned int16 + RET[programIndex] = psubs(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = -32768; +} diff --git a/tests/psubs_vi8.ispc b/tests/psubs_vi8.ispc new file mode 100644 index 00000000..a6148a3f --- /dev/null +++ b/tests/psubs_vi8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int8 a = -128, b = 127; // min and max unsigned int8 + RET[programIndex] = psubs(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = -128; +} diff --git a/tests/psubus_i16.ispc b/tests/psubus_i16.ispc new file mode 100644 index 00000000..b31b250e --- /dev/null +++ b/tests/psubus_i16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a = 0, b = 32767; // min and max unsigned int16 + RET[programIndex] = psubus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/psubus_i8.ispc b/tests/psubus_i8.ispc new file mode 100644 index 00000000..c073d306 --- /dev/null +++ b/tests/psubus_i8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + uniform int8 a = 0, b = 255; // min and max unsigned int8 + RET[programIndex] = psubus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/psubus_vi16.ispc b/tests/psubus_vi16.ispc new file mode 100644 index 00000000..fd4db693 --- /dev/null +++ b/tests/psubus_vi16.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int16 a = 0, b = 32767; // min and max unsigned int16 + 
RET[programIndex] = psubus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} diff --git a/tests/psubus_vi8.ispc b/tests/psubus_vi8.ispc new file mode 100644 index 00000000..3c00308f --- /dev/null +++ b/tests/psubus_vi8.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int8 a = 0, b = 255; // min and max unsigned int8 + RET[programIndex] = psubus(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; +} From ea94658411fa3e2bfc11cb5c4b791a6143fe521f Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Fri, 6 Dec 2013 17:20:37 +0400 Subject: [PATCH 08/16] Some saturation tests fixes --- tests/padds_i8.ispc | 2 +- tests/paddus_i8.ispc | 2 +- tests/paddus_vi8.ispc | 2 +- tests/psubs_i16.ispc | 2 +- tests/psubus_i16.ispc | 2 +- tests/psubus_i8.ispc | 2 +- tests/psubus_vi8.ispc | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/padds_i8.ispc b/tests/padds_i8.ispc index 81da8a21..d7bdc8b6 100644 --- a/tests/padds_i8.ispc +++ b/tests/padds_i8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int8 a = 127, b = 127; // max signed int8 - RET[programIndex] = padds(a1, b1); + RET[programIndex] = padds(a, b); } export void result(uniform float RET[]) { diff --git a/tests/paddus_i8.ispc b/tests/paddus_i8.ispc index 23de8c21..1c585369 100644 --- a/tests/paddus_i8.ispc +++ b/tests/paddus_i8.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int8 a = 255, b = 255; // max unsigned int8 + uniform unsigned int8 a = 255, b = 255; // max unsigned int8 RET[programIndex] = paddus(a, b); } diff --git a/tests/paddus_vi8.ispc b/tests/paddus_vi8.ispc index 3d7d3509..c9d7a115 100644 --- a/tests/paddus_vi8.ispc +++ b/tests/paddus_vi8.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = 255, b = 255; // max unsigned int8 + varying unsigned int8 a = 255, b = 255; // max unsigned int8 RET[programIndex] = paddus(a, b); } diff --git a/tests/psubs_i16.ispc b/tests/psubs_i16.ispc index 9038215e..d66f51ad 100644 --- a/tests/psubs_i16.ispc +++ b/tests/psubs_i16.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int8 a = -32768, b = 32767; // min and max signed int16 + uniform int16 a = -32768, b = 32767; // min and max signed int16 RET[programIndex] = psubs(a, b); } diff --git a/tests/psubus_i16.ispc b/tests/psubus_i16.ispc index b31b250e..c02922d2 100644 --- a/tests/psubus_i16.ispc +++ b/tests/psubus_i16.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int8 a = 0, b = 32767; // min and max unsigned int16 + uniform unsigned int8 a = 0, b = 32767; // min and max unsigned int16 RET[programIndex] = psubus(a, b); } diff --git a/tests/psubus_i8.ispc b/tests/psubus_i8.ispc index c073d306..a45e9f6e 100644 --- a/tests/psubus_i8.ispc +++ b/tests/psubus_i8.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int8 a = 0, b = 255; // min and max unsigned int8 + uniform unsigned int8 a = 0, b = 255; // min 
and max unsigned int8 RET[programIndex] = psubus(a, b); } diff --git a/tests/psubus_vi8.ispc b/tests/psubus_vi8.ispc index 3c00308f..46005204 100644 --- a/tests/psubus_vi8.ispc +++ b/tests/psubus_vi8.ispc @@ -2,7 +2,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = 0, b = 255; // min and max unsigned int8 + varying unsigned int8 a = 0, b = 255; // min and max unsigned int8 RET[programIndex] = psubus(a, b); } From 9a135c48d95c84bde1a1038b0de6430087ca04d6 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Mon, 9 Dec 2013 00:20:52 +0400 Subject: [PATCH 09/16] Functions name change --- stdlib.ispc | 48 +++++++++++++++++++++--------------------- tests/padds_i16.ispc | 2 +- tests/padds_i8.ispc | 2 +- tests/padds_vi16.ispc | 2 +- tests/padds_vi8.ispc | 2 +- tests/paddus_i16.ispc | 4 ++-- tests/paddus_i8.ispc | 2 +- tests/paddus_vi16.ispc | 4 ++-- tests/paddus_vi8.ispc | 2 +- tests/psubs_i16.ispc | 2 +- tests/psubs_i8.ispc | 2 +- tests/psubs_vi16.ispc | 2 +- tests/psubs_vi8.ispc | 2 +- tests/psubus_i16.ispc | 2 +- tests/psubus_i8.ispc | 2 +- tests/psubus_vi16.ispc | 4 ++-- tests/psubus_vi8.ispc | 2 +- 17 files changed, 43 insertions(+), 43 deletions(-) diff --git a/stdlib.ispc b/stdlib.ispc index 9e296687..487b4184 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4260,75 +4260,75 @@ static inline void fastmath() { /////////////////////////////////////////////////////////////////////////// // saturation arithmetic -static inline uniform int8 padds(uniform int8 a, uniform int8 b) { +static inline uniform int8 saturating_add(uniform int8 a, uniform int8 b) { return __padds_i8(a, b); } -static inline varying int8 padds(varying int8 a, varying int8 b) { +static inline varying int8 saturating_add(varying int8 a, varying int8 b) { return __padds_vi8(a, b); } -static inline uniform int16 padds(uniform int16 a, uniform int16 b) { +static inline uniform int16 saturating_add(uniform int16 a, uniform int16 b) { return __padds_i16(a, b); } -static inline varying int16 padds(varying int16 a, varying int16 b) { +static inline varying int16 saturating_add(varying int16 a, varying int16 b) { return __padds_vi16(a, b); } -static inline uniform unsigned int8 paddus(uniform unsigned int8 a, - uniform unsigned int8 b) { +static inline uniform unsigned int8 saturating_add(uniform unsigned int8 a, + uniform unsigned int8 b) { return __paddus_i8(a, b); } -static inline varying unsigned int8 paddus(varying unsigned int8 a, - varying unsigned int8 b) { +static inline varying unsigned int8 saturating_add(varying unsigned int8 a, + varying unsigned int8 b) { return __paddus_vi8(a, b); } -static inline uniform unsigned int16 paddus(uniform unsigned int16 a, - uniform unsigned int16 b) { +static inline uniform unsigned int16 saturating_add(uniform unsigned int16 a, + uniform unsigned int16 b) { return __paddus_i16(a, b); } -static inline varying unsigned int16 paddus(varying unsigned int16 a, - varying unsigned int16 b) { +static inline varying unsigned int16 saturating_add(varying unsigned int16 a, + varying unsigned int16 b) { return __paddus_vi16(a, b); } -static inline uniform int8 psubs(uniform int8 a, uniform int8 b) { +static inline uniform int8 saturating_sub(uniform int8 a, uniform int8 b) { return __psubs_i8(a, b); } -static inline varying int8 psubs(varying int8 a, varying int8 b) { +static inline varying int8 saturating_sub(varying int8 a, varying int8 b) { return __psubs_vi8(a, b); } -static inline uniform int16 psubs(uniform 
int16 a, uniform int16 b) { +static inline uniform int16 saturating_sub(uniform int16 a, uniform int16 b) { return __psubs_i16(a, b); } -static inline varying int16 psubs(varying int16 a, varying int16 b) { +static inline varying int16 saturating_sub(varying int16 a, varying int16 b) { return __psubs_vi16(a, b); } -static inline uniform unsigned int8 psubus(uniform unsigned int8 a, - uniform unsigned int8 b) { +static inline uniform unsigned int8 saturating_sub(uniform unsigned int8 a, + uniform unsigned int8 b) { return __psubus_i8(a, b); } -static inline varying unsigned int8 psubus(varying unsigned int8 a, - varying unsigned int8 b) { +static inline varying unsigned int8 saturating_sub(varying unsigned int8 a, + varying unsigned int8 b) { return __psubus_vi8(a, b); } -static inline uniform unsigned int16 psubus(uniform unsigned int16 a, - uniform unsigned int16 b) { +static inline uniform unsigned int16 saturating_sub(uniform unsigned int16 a, + uniform unsigned int16 b) { return __psubus_i16(a, b); } -static inline varying unsigned int16 psubus(varying unsigned int16 a, - varying unsigned int16 b) { +static inline varying unsigned int16 saturating_sub(varying unsigned int16 a, + varying unsigned int16 b) { return __psubus_vi16(a, b); } /////////////////////////////////////////////////////////////////////////// diff --git a/tests/padds_i16.ispc b/tests/padds_i16.ispc index 4668071b..930593ac 100644 --- a/tests/padds_i16.ispc +++ b/tests/padds_i16.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int16 a = 32767, b = 32767; // max signed int16 - RET[programIndex] = padds(a, b); + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/padds_i8.ispc b/tests/padds_i8.ispc index d7bdc8b6..6d72a61b 100644 --- a/tests/padds_i8.ispc +++ b/tests/padds_i8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int8 a = 127, b = 127; // max signed int8 - RET[programIndex] = padds(a, b); + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/padds_vi16.ispc b/tests/padds_vi16.ispc index 7c6848e7..b48d776a 100644 --- a/tests/padds_vi16.ispc +++ b/tests/padds_vi16.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { varying int16 a = 32767, b = 32767; // max signed int16 - RET[programIndex] = padds(a, b); + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/padds_vi8.ispc b/tests/padds_vi8.ispc index 5d6196be..71d42cb8 100644 --- a/tests/padds_vi8.ispc +++ b/tests/padds_vi8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { varying int8 a = 127, b = 127; // max signed int8 - RET[programIndex] = padds(a, b); + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/paddus_i16.ispc b/tests/paddus_i16.ispc index d2939677..968953fa 100644 --- a/tests/paddus_i16.ispc +++ b/tests/paddus_i16.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int16 a = 65535, b = 65535; // max unsigned int16 - RET[programIndex] = paddus(a, b); + uniform unsigned int16 a = 65535, b = 65535; // max 
unsigned int16 + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/paddus_i8.ispc b/tests/paddus_i8.ispc index 1c585369..44c41a6c 100644 --- a/tests/paddus_i8.ispc +++ b/tests/paddus_i8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform unsigned int8 a = 255, b = 255; // max unsigned int8 - RET[programIndex] = paddus(a, b); + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/paddus_vi16.ispc b/tests/paddus_vi16.ispc index 803259f5..4d15e49b 100644 --- a/tests/paddus_vi16.ispc +++ b/tests/paddus_vi16.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = 65535, b = 65535; // max unsigned int16 - RET[programIndex] = paddus(a, b); + varying unsigned int16 a = 65535, b = 65535; // max unsigned int16 + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/paddus_vi8.ispc b/tests/paddus_vi8.ispc index c9d7a115..77fcec7a 100644 --- a/tests/paddus_vi8.ispc +++ b/tests/paddus_vi8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { varying unsigned int8 a = 255, b = 255; // max unsigned int8 - RET[programIndex] = paddus(a, b); + RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubs_i16.ispc b/tests/psubs_i16.ispc index d66f51ad..163af2da 100644 --- a/tests/psubs_i16.ispc +++ b/tests/psubs_i16.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int16 a = -32768, b = 32767; // min and max signed int16 - RET[programIndex] = psubs(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubs_i8.ispc b/tests/psubs_i8.ispc index 1a661520..1dba8fe3 100644 --- a/tests/psubs_i8.ispc +++ b/tests/psubs_i8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform int8 a = -128, b = 127; // min and max signed int8 - RET[programIndex] = psubs(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubs_vi16.ispc b/tests/psubs_vi16.ispc index b1e2cf48..3208e842 100644 --- a/tests/psubs_vi16.ispc +++ b/tests/psubs_vi16.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { varying int16 a = -32768, b = 32767; // min and max unsigned int16 - RET[programIndex] = psubs(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubs_vi8.ispc b/tests/psubs_vi8.ispc index a6148a3f..143aaf4e 100644 --- a/tests/psubs_vi8.ispc +++ b/tests/psubs_vi8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { varying int8 a = -128, b = 127; // min and max unsigned int8 - RET[programIndex] = psubs(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubus_i16.ispc b/tests/psubus_i16.ispc index c02922d2..bb62f03f 100644 --- a/tests/psubus_i16.ispc +++ b/tests/psubus_i16.ispc @@ -3,7 +3,7 @@ 
export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform unsigned int8 a = 0, b = 32767; // min and max unsigned int16 - RET[programIndex] = psubus(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubus_i8.ispc b/tests/psubus_i8.ispc index a45e9f6e..176ecc33 100644 --- a/tests/psubus_i8.ispc +++ b/tests/psubus_i8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { uniform unsigned int8 a = 0, b = 255; // min and max unsigned int8 - RET[programIndex] = psubus(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubus_vi16.ispc b/tests/psubus_vi16.ispc index fd4db693..ca58f374 100644 --- a/tests/psubus_vi16.ispc +++ b/tests/psubus_vi16.ispc @@ -2,8 +2,8 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = 0, b = 32767; // min and max unsigned int16 - RET[programIndex] = psubus(a, b); + varying unsigned int16 a = 0, b = 32767; // min and max unsigned int16 + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { diff --git a/tests/psubus_vi8.ispc b/tests/psubus_vi8.ispc index 46005204..e730fd7e 100644 --- a/tests/psubus_vi8.ispc +++ b/tests/psubus_vi8.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { varying unsigned int8 a = 0, b = 255; // min and max unsigned int8 - RET[programIndex] = psubus(a, b); + RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { From 07c6f1714a3500c767d3736d850d8996bbcf11a2 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Sun, 22 Dec 2013 19:28:26 +0400 Subject: [PATCH 10/16] Some fixes in function names and more tests were added.
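For reference, the sketch below illustrates the saturating behaviour the renamed stdlib wrappers are meant to expose and that the tests in this series check. The wrapper names (saturating_add, saturating_sub) come from stdlib.ispc; the demo function and its name are only illustrative and are not part of the library or of this patch:

    // Minimal ispc sketch (hypothetical demo function, mirroring the test style):
    // saturating_add/saturating_sub clamp to the type's range instead of wrapping.
    export void saturation_demo(uniform float RET[]) {
        uniform int8 a = 127, b = 1;            // max signed int8
        uniform int16 c = -32768, d = 1;        // min signed int16
        uniform unsigned int8 e = 255, f = 1;   // max unsigned int8
        RET[0] = saturating_add(a, b);          // stays 127, no wrap to -128
        RET[1] = saturating_sub(c, d);          // stays -32768
        RET[2] = saturating_add(e, f);          // stays 255
    }

These wrappers resolve to the per-target __padds_*/__paddus_*/__psubs_*/__psubus_* builtins; the m4 changes below select the implementation flavour from the target WIDTH.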
--- builtins/target-avx-common.ll | 2 +- builtins/target-avx-x2.ll | 2 +- builtins/target-avx1-i64x4base.ll | 2 +- builtins/target-avx1.ll | 2 +- builtins/target-avx11.ll | 3 +- builtins/target-avx2.ll | 3 +- builtins/target-generic-1.ll | 2 +- builtins/target-generic-16.ll | 2 +- builtins/target-generic-4.ll | 2 +- builtins/target-generic-8.ll | 2 +- builtins/target-generic-common.ll | 2 +- builtins/target-sse2-common.ll | 2 +- builtins/target-sse2-x2.ll | 2 +- builtins/target-sse2.ll | 2 +- builtins/target-sse4-16.ll | 2 +- builtins/target-sse4-8.ll | 2 +- builtins/target-sse4-common.ll | 2 +- builtins/target-sse4-x2.ll | 2 +- builtins/target-sse4.ll | 2 +- builtins/util.m4 | 125 ++++++++++++++++-------------- tests/padds_i16-2.ispc | 11 +++ tests/padds_i16.ispc | 6 +- tests/padds_i8-2.ispc | 11 +++ tests/padds_i8.ispc | 6 +- tests/padds_vi16-2.ispc | 11 +++ tests/padds_vi16.ispc | 4 +- tests/padds_vi8-2.ispc | 11 +++ tests/padds_vi8.ispc | 4 +- tests/paddus_i16.ispc | 6 +- tests/paddus_i8.ispc | 6 +- tests/paddus_vi16.ispc | 4 +- tests/paddus_vi8.ispc | 4 +- tests/psubs_i16-2.ispc | 11 +++ tests/psubs_i16.ispc | 6 +- tests/psubs_i8-2.ispc | 11 +++ tests/psubs_i8.ispc | 6 +- tests/psubs_vi16-2.ispc | 11 +++ tests/psubs_vi16.ispc | 4 +- tests/psubs_vi8-2.ispc | 11 +++ tests/psubs_vi8.ispc | 4 +- tests/psubus_i16.ispc | 6 +- tests/psubus_i8.ispc | 6 +- tests/psubus_vi16.ispc | 4 +- tests/psubus_vi8.ispc | 4 +- 44 files changed, 215 insertions(+), 118 deletions(-) create mode 100644 tests/padds_i16-2.ispc create mode 100644 tests/padds_i8-2.ispc create mode 100644 tests/padds_vi16-2.ispc create mode 100644 tests/padds_vi8-2.ispc create mode 100644 tests/psubs_i16-2.ispc create mode 100644 tests/psubs_i8-2.ispc create mode 100644 tests/psubs_vi16-2.ispc create mode 100644 tests/psubs_vi8-2.ispc diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index 32157a77..d6b577b8 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -40,7 +40,7 @@ ctlztz() define_prefetches() define_shuffles() aossoa() -saturation_arithmetic_scalar() +saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index cde63e7b..8d3e29c8 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -40,7 +40,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec16() +saturation_arithmetic() include(`target-avx-common.ll') diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index a2d292f2..d9c60c26 100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -40,7 +40,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec4() +saturation_arithmetic() include(`target-avx-common.ll') diff --git a/builtins/target-avx1.ll b/builtins/target-avx1.ll index f0cf1efb..a9ddc112 100644 --- a/builtins/target-avx1.ll +++ b/builtins/target-avx1.ll @@ -32,7 +32,7 @@ include(`target-avx.ll') rdrand_decls() -saturation_arithmetic_vec8() +saturation_arithmetic() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index 706314a5..c4c421a0 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -34,7 +34,8 @@ include(`target-avx.ll') ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', LLVM_VERSION, 
`LLVM_3_1', `rdrand_decls()', `rdrand_definition()') -saturation_arithmetic_vec8() + +saturation_arithmetic() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index c5f8e84f..20ecef47 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -38,7 +38,8 @@ include(`target-avx.ll') ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', `rdrand_definition()') -saturation_arithmetic_vec8() + +saturation_arithmetic() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index bb974932..af343496 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -9,7 +9,7 @@ packed_load_and_store() scans() int64minmax() aossoa() -saturation_arithmetic_scalar() +saturation_arithmetic() saturation_arithmetic_novec() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-generic-16.ll b/builtins/target-generic-16.ll index 36a2ee4c..df04187c 100644 --- a/builtins/target-generic-16.ll +++ b/builtins/target-generic-16.ll @@ -31,4 +31,4 @@ define(`WIDTH',`16') include(`target-generic-common.ll') -saturation_arithmetic_vec16() +saturation_arithmetic() diff --git a/builtins/target-generic-4.ll b/builtins/target-generic-4.ll index a7e8dcaa..e43f45c5 100644 --- a/builtins/target-generic-4.ll +++ b/builtins/target-generic-4.ll @@ -31,4 +31,4 @@ define(`WIDTH',`4') include(`target-generic-common.ll') -saturation_arithmetic_vec4() +saturation_arithmetic() diff --git a/builtins/target-generic-8.ll b/builtins/target-generic-8.ll index b692322e..6b87509d 100644 --- a/builtins/target-generic-8.ll +++ b/builtins/target-generic-8.ll @@ -31,4 +31,4 @@ define(`WIDTH',`8') include(`target-generic-common.ll') -saturation_arithmetic_vec8() +saturation_arithmetic() diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index c4d3b950..6f5199d8 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -41,7 +41,7 @@ stdlib_core() scans() reduce_equal(WIDTH) rdrand_decls() -saturation_arithmetic_scalar() +saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; broadcast/rotate/shuffle diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index b5c5559c..d8a461aa 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -34,7 +34,7 @@ define_prefetches() define_shuffles() aossoa() rdrand_decls() -saturation_arithmetic_scalar() +saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index b4b52d91..1cb2abc4 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -44,7 +44,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec8() +saturation_arithmetic() include(`target-sse2-common.ll') diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index bdf6f848..ee8b533c 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -41,7 +41,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec4() +saturation_arithmetic() include(`target-sse2-common.ll') diff --git a/builtins/target-sse4-16.ll 
b/builtins/target-sse4-16.ll index 1c0b045a..00ff2519 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -41,7 +41,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec8() +saturation_arithmetic() include(`target-sse4-common.ll') diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 49351856..15c577e8 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -41,7 +41,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec16() +saturation_arithmetic() include(`target-sse4-common.ll') diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 8eeaa413..2dd5c149 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -37,7 +37,7 @@ define_prefetches() define_shuffles() aossoa() rdrand_decls() -saturation_arithmetic_scalar() +saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 2cd0ea4d..59a6942a 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -44,7 +44,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec8() +saturation_arithmetic() include(`target-sse4-common.ll') diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 96effe39..4762836d 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -41,7 +41,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() -saturation_arithmetic_vec4() +saturation_arithmetic() include(`target-sse4-common.ll') diff --git a/builtins/util.m4 b/builtins/util.m4 index e0f7aaec..6f36f71e 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -179,10 +179,17 @@ define(`convert32to16', ` ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;saturation arithmetic -;;scalar saturation arithmetic +;;saturation arithmetic + +define(`saturation_arithmetic', +`ifelse(WIDTH, `4', `saturation_arithmetic_vec4()', + WIDTH, `8', `saturation_arithmetic_vec8()', + WIDTH, `16', `saturation_arithmetic_vec16()', + `saturation_arithmetic_uniform()')') -define(`saturation_arithmetic_scalar', ` +;;uniform saturation arithmetic + +define(`saturation_arithmetic_uniform', ` declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__padds_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) @@ -303,168 +310,168 @@ define @__psubus_vi16(, ) { ;;4-wide vector saturation arithmetic define(`saturation_arithmetic_vec4', ` -define @__padds_vi8(, ) { +define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) convert16to4(i8, %r16, %r) - ret %r + ret <4 x i8> %r } -define @__padds_vi16(, ) { +define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) convert8to4(i16, %r16, %r) - ret %r + ret <4 x i16> %r } -define @__paddus_vi8(, ) { +define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) convert16to4(i8, %r16, %r) - ret %r + ret <4 x i8> %r } -define @__paddus_vi16(, ) { +define <4 x i16> @__paddus_vi16(<4 x 
i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) convert8to4(i16, %r16, %r) - ret %r + ret <4 x i16> %r } -define @__psubs_vi8(, ) { +define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) convert16to4(i8, %r16, %r) - ret %r + ret <4 x i8> %r } -define @__psubs_vi16(, ) { +define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) convert8to4(i16, %r16, %r) - ret %r + ret <4 x i16> %r } -define @__psubus_vi8(, ) { +define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) convert16to4(i8, %r16, %r) - ret %r + ret <4 x i8> %r } -define @__psubus_vi16(, ) { +define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) convert8to4(i16, %r16, %r) - ret %r + ret <4 x i16> %r } ') ;;8-wide vector saturation arithmetic define(`saturation_arithmetic_vec8', ` -define @__padds_vi8(, ) { +define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) convert16to8(i8, %r16, %r) - ret %r + ret <8 x i8> %r } -define @__padds_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.padds.w( %a0, %a1) - ret %res +define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res } -define @__paddus_vi8(, ) { +define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) convert16to8(i8, %r16, %r) - ret %r + ret <8 x i8> %r } -define @__paddus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.w( %a0, %a1) - ret %res +define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res } -define @__psubs_vi8(, ) { +define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) convert16to8(i8, %r16, %r) - ret %r + ret <8 x i8> %r } -define @__psubs_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.w( %a0, %a1) - ret %res +define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res } -define @__psubus_vi8(, ) { +define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) convert16to8(i8, %r16, %r) - ret %r + ret <8 x i8> %r } -define @__psubus_vi16( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.w( %a0, %a1) - ret %res +define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res } ') ;;16-wide vector saturation arithmetic define(`saturation_arithmetic_vec16', ` -define @__padds_vi8( %a0, 
%a1) { - %res = call @llvm.x86.sse2.padds.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res +define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res } -define @__padds_vi16( %a0, %a1) { +define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) - ret %ret + ret <16 x i16> %ret } -define @__paddus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.paddus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res +define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res } -define @__paddus_vi16( %a0, %a1) { +define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret %ret + ret <16 x i16> %ret } -define @__psubs_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubs.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res +define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res } -define @__psubs_vi16( %a0, %a1) { +define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret %ret + ret <16 x i16> %ret } -define @__psubus_vi8( %a0, %a1) { - %res = call @llvm.x86.sse2.psubus.b( %a0, %a1) ; <<16 x i8>> [#uses=1] - ret %res +define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res } -define @__psubus_vi16( %a0, %a1) { +define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) - ret %ret + ret <16 x i16> %ret } ') diff --git a/tests/padds_i16-2.ispc b/tests/padds_i16-2.ispc new file mode 100644 index 00000000..83234804 --- /dev/null +++ b/tests/padds_i16-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int16 a = -32768; // min signed int16 + RET[programIndex] = saturating_add(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (uniform int16) -32768; +} diff --git a/tests/padds_i16.ispc b/tests/padds_i16.ispc index 930593ac..e5456416 100644 --- a/tests/padds_i16.ispc +++ b/tests/padds_i16.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int16 a = 32767, b = 32767; // max signed int16 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int16 a = 32767; // max signed int16 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 32767; + RET[programIndex] = (uniform int16) 32767; } diff --git a/tests/padds_i8-2.ispc b/tests/padds_i8-2.ispc new file mode 100644 index 00000000..9a303d70 --- /dev/null +++ b/tests/padds_i8-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int8 a = -128; // min signed int8 + RET[programIndex] = saturating_add(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 
(uniform int8) -128; +} diff --git a/tests/padds_i8.ispc b/tests/padds_i8.ispc index 6d72a61b..bbcc4cc7 100644 --- a/tests/padds_i8.ispc +++ b/tests/padds_i8.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int8 a = 127, b = 127; // max signed int8 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int8 a = 127; // max signed int8 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 127; + RET[programIndex] = (uniform int8) 127; } diff --git a/tests/padds_vi16-2.ispc b/tests/padds_vi16-2.ispc new file mode 100644 index 00000000..5f1eda37 --- /dev/null +++ b/tests/padds_vi16-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int16 a = -32768, b = aFOO[programIndex]; // max signed int16 + RET[programIndex] = saturating_add(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (varying int16) -32768; +} diff --git a/tests/padds_vi16.ispc b/tests/padds_vi16.ispc index b48d776a..e3bd0f51 100644 --- a/tests/padds_vi16.ispc +++ b/tests/padds_vi16.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = 32767, b = 32767; // max signed int16 + varying int16 a = 32767, b = aFOO[programIndex]; // max signed int16 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 32767; + RET[programIndex] = (varying int16) 32767; } diff --git a/tests/padds_vi8-2.ispc b/tests/padds_vi8-2.ispc new file mode 100644 index 00000000..e3302d18 --- /dev/null +++ b/tests/padds_vi8-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int8 a = -128, b = aFOO[programIndex]; // max signed int8 + RET[programIndex] = saturating_add(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (varying int8) -128; +} diff --git a/tests/padds_vi8.ispc b/tests/padds_vi8.ispc index 71d42cb8..df921414 100644 --- a/tests/padds_vi8.ispc +++ b/tests/padds_vi8.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = 127, b = 127; // max signed int8 + varying int8 a = 127, b = aFOO[programIndex]; // max signed int8 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 127; + RET[programIndex] = (varying int8) 127; } diff --git a/tests/paddus_i16.ispc b/tests/paddus_i16.ispc index 968953fa..e38f6db7 100644 --- a/tests/paddus_i16.ispc +++ b/tests/paddus_i16.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform unsigned int16 a = 65535, b = 65535; // max unsigned int16 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform unsigned int16 a = 65535; // max unsigned int16 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 65535; + RET[programIndex] = (uniform unsigned int16) 65535; } diff --git a/tests/paddus_i8.ispc b/tests/paddus_i8.ispc index 44c41a6c..7cd3ecf8 100644 --- a/tests/paddus_i8.ispc +++ b/tests/paddus_i8.ispc @@ -1,11 
+1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform unsigned int8 a = 255, b = 255; // max unsigned int8 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform unsigned int8 a = 255; // max unsigned int8 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 255; + RET[programIndex] = (uniform unsigned int8) 255; } diff --git a/tests/paddus_vi16.ispc b/tests/paddus_vi16.ispc index 4d15e49b..c4454cd2 100644 --- a/tests/paddus_vi16.ispc +++ b/tests/paddus_vi16.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int16 a = 65535, b = 65535; // max unsigned int16 + varying unsigned int16 a = 65535, b = aFOO[programIndex]; // max unsigned int16 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 65535; + RET[programIndex] = (varying unsigned int16) 65535; } diff --git a/tests/paddus_vi8.ispc b/tests/paddus_vi8.ispc index 77fcec7a..b7b970ff 100644 --- a/tests/paddus_vi8.ispc +++ b/tests/paddus_vi8.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int8 a = 255, b = 255; // max unsigned int8 + varying unsigned int8 a = 255, b = aFOO[programIndex]; // max unsigned int8 RET[programIndex] = saturating_add(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 255; + RET[programIndex] = (varying unsigned int8) 255; } diff --git a/tests/psubs_i16-2.ispc b/tests/psubs_i16-2.ispc new file mode 100644 index 00000000..ace62b1c --- /dev/null +++ b/tests/psubs_i16-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int16 a = 32767; // max signed int16 + RET[programIndex] = saturating_sub(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (uniform int16) 32767; +} diff --git a/tests/psubs_i16.ispc b/tests/psubs_i16.ispc index 163af2da..47f3d2b9 100644 --- a/tests/psubs_i16.ispc +++ b/tests/psubs_i16.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int16 a = -32768, b = 32767; // min and max signed int16 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int16 a = -32768; // min signed int16 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = -32768; + RET[programIndex] = (uniform int16) -32768; } diff --git a/tests/psubs_i8-2.ispc b/tests/psubs_i8-2.ispc new file mode 100644 index 00000000..6d3d608a --- /dev/null +++ b/tests/psubs_i8-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int8 a = 127; // max signed int8 + RET[programIndex] = saturating_sub(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (uniform int8) 127; +} diff --git a/tests/psubs_i8.ispc b/tests/psubs_i8.ispc index 1dba8fe3..fbc24d25 100644 --- a/tests/psubs_i8.ispc +++ b/tests/psubs_i8.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { 
- uniform int8 a = -128, b = 127; // min and max signed int8 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int8 a = -128; // min signed int8 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = -128; + RET[programIndex] = (uniform int8) -128; } diff --git a/tests/psubs_vi16-2.ispc b/tests/psubs_vi16-2.ispc new file mode 100644 index 00000000..ef1b2ef4 --- /dev/null +++ b/tests/psubs_vi16-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int16 a = 32767, b = aFOO[programIndex]; // min unsigned int16 + RET[programIndex] = saturating_sub(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (varying int16) 32767; +} diff --git a/tests/psubs_vi16.ispc b/tests/psubs_vi16.ispc index 3208e842..e405a23f 100644 --- a/tests/psubs_vi16.ispc +++ b/tests/psubs_vi16.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = -32768, b = 32767; // min and max unsigned int16 + varying int16 a = -32768, b = aFOO[programIndex]; // min unsigned int16 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = -32768; + RET[programIndex] = (varying int16) -32768; } diff --git a/tests/psubs_vi8-2.ispc b/tests/psubs_vi8-2.ispc new file mode 100644 index 00000000..b7fb02c6 --- /dev/null +++ b/tests/psubs_vi8-2.ispc @@ -0,0 +1,11 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + varying int8 a = 127, b = aFOO[programIndex]; // min unsigned int8 + RET[programIndex] = saturating_sub(a, -b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = (varying int8) 127; +} diff --git a/tests/psubs_vi8.ispc b/tests/psubs_vi8.ispc index 143aaf4e..7d852f0a 100644 --- a/tests/psubs_vi8.ispc +++ b/tests/psubs_vi8.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = -128, b = 127; // min and max unsigned int8 + varying int8 a = -128, b = aFOO[programIndex]; // min unsigned int8 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = -128; + RET[programIndex] = (varying int8) -128; } diff --git a/tests/psubus_i16.ispc b/tests/psubus_i16.ispc index bb62f03f..a7f60603 100644 --- a/tests/psubus_i16.ispc +++ b/tests/psubus_i16.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform unsigned int8 a = 0, b = 32767; // min and max unsigned int16 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform unsigned int8 a = 0; // min unsigned int16 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 0; + RET[programIndex] = (uniform unsigned int8) 0; } diff --git a/tests/psubus_i8.ispc b/tests/psubus_i8.ispc index 176ecc33..7cb7ecdc 100644 --- a/tests/psubus_i8.ispc +++ b/tests/psubus_i8.ispc @@ -1,11 +1,11 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform unsigned int8 a = 0, b = 255; // min and max unsigned int8 +export void f_fu(uniform float RET[], uniform float aFOO[], uniform 
float b) { + uniform unsigned int8 a = 0; // min unsigned int8 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 0; + RET[programIndex] = (uniform unsigned int8) 0; } diff --git a/tests/psubus_vi16.ispc b/tests/psubus_vi16.ispc index ca58f374..e441b699 100644 --- a/tests/psubus_vi16.ispc +++ b/tests/psubus_vi16.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int16 a = 0, b = 32767; // min and max unsigned int16 + varying unsigned int16 a = 0, b = aFOO[programIndex]; // min unsigned int16 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 0; + RET[programIndex] = (varying unsigned int16) 0; } diff --git a/tests/psubus_vi8.ispc b/tests/psubus_vi8.ispc index e730fd7e..7ba5f14a 100644 --- a/tests/psubus_vi8.ispc +++ b/tests/psubus_vi8.ispc @@ -2,10 +2,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int8 a = 0, b = 255; // min and max unsigned int8 + varying unsigned int8 a = 0, b = aFOO[programIndex]; // min unsigned int8 RET[programIndex] = saturating_sub(a, b); } export void result(uniform float RET[]) { - RET[programIndex] = 0; + RET[programIndex] = (varying unsigned int8) 0; } From 323587f10f2f7a02104a627692afbffcda822cb5 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Thu, 2 Jan 2014 16:48:56 +0400 Subject: [PATCH 11/16] Scalar implementation and implementation for targets which don't have h/w instructions --- builtins/target-generic-16.ll | 2 +- builtins/target-generic-32.ll | 1 + builtins/target-generic-4.ll | 2 +- builtins/target-generic-64.ll | 1 + builtins/target-generic-8.ll | 2 +- builtins/util.m4 | 231 +++++++++++++++++++--------------- 6 files changed, 134 insertions(+), 105 deletions(-) diff --git a/builtins/target-generic-16.ll b/builtins/target-generic-16.ll index df04187c..cc5644bc 100644 --- a/builtins/target-generic-16.ll +++ b/builtins/target-generic-16.ll @@ -31,4 +31,4 @@ define(`WIDTH',`16') include(`target-generic-common.ll') -saturation_arithmetic() +saturation_arithmetic_novec() diff --git a/builtins/target-generic-32.ll b/builtins/target-generic-32.ll index 5f89bcdf..8eb31c48 100644 --- a/builtins/target-generic-32.ll +++ b/builtins/target-generic-32.ll @@ -31,3 +31,4 @@ define(`WIDTH',`32') include(`target-generic-common.ll') +saturation_arithmetic_novec() diff --git a/builtins/target-generic-4.ll b/builtins/target-generic-4.ll index e43f45c5..d80c5b91 100644 --- a/builtins/target-generic-4.ll +++ b/builtins/target-generic-4.ll @@ -31,4 +31,4 @@ define(`WIDTH',`4') include(`target-generic-common.ll') -saturation_arithmetic() +saturation_arithmetic_novec() diff --git a/builtins/target-generic-64.ll b/builtins/target-generic-64.ll index 09443f8e..6a044c41 100644 --- a/builtins/target-generic-64.ll +++ b/builtins/target-generic-64.ll @@ -31,3 +31,4 @@ define(`WIDTH',`64') include(`target-generic-common.ll') +saturation_arithmetic_novec() diff --git a/builtins/target-generic-8.ll b/builtins/target-generic-8.ll index 6b87509d..4353658c 100644 --- a/builtins/target-generic-8.ll +++ b/builtins/target-generic-8.ll @@ -31,4 +31,4 @@ define(`WIDTH',`8') include(`target-generic-common.ll') -saturation_arithmetic() +saturation_arithmetic_novec() diff --git a/builtins/util.m4 b/builtins/util.m4 index 6f36f71e..de48a0a1 100644 --- a/builtins/util.m4 +++ 
b/builtins/util.m4 @@ -187,124 +187,151 @@ define(`saturation_arithmetic', WIDTH, `16', `saturation_arithmetic_vec16()', `saturation_arithmetic_uniform()')') -;;uniform saturation arithmetic +;; utility function used by saturation_arithmetic_uniform below. This shouldn't be called by +;; target .ll files directly. +;; $1: {add,sub} (used in constructing function names) -define(`saturation_arithmetic_uniform', ` -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__padds_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) +define(`saturation_arithmetic_uniform_universal', ` +declare <16 x i8> @llvm.x86.sse2.p$1s.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__p$1s_i8(i8 %a0, i8 %a1) { + %a0_i16 = sext i8 %a0 to i16 + %a1_i16 = sext i8 %a1 to i16 + %res = $1 i16 %a0_i16, %a1_i16 + %over_mask = icmp sgt i16 %res, 127 + %over_res = select i1 %over_mask, i16 127, i16 %res + %under_mask = icmp slt i16 %res, -128 + %ret_i16 = select i1 %under_mask, i16 -128, i16 %over_res + %ret = trunc i16 %ret_i16 to i8 ret i8 %ret } -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__padds_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) +declare <8 x i16> @llvm.x86.sse2.p$1s.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__p$1s_i16(i16 %a0, i16 %a1) { + %a0_i32 = sext i16 %a0 to i32 + %a1_i32 = sext i16 %a1 to i32 + %res = $1 i32 %a0_i32, %a1_i32 + %over_mask = icmp sgt i32 %res, 32767 + %over_res = select i1 %over_mask, i32 32767, i32 %res + %under_mask = icmp slt i32 %res, -32768 + %ret_i32 = select i1 %under_mask, i32 -32768, i32 %over_res + %ret = trunc i32 %ret_i32 to i16 ret i16 %ret } -declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__paddus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) +declare <16 x i8> @llvm.x86.sse2.p$1us.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__p$1us_i8(i8 %a0, i8 %a1) { + %a0_i16 = zext i8 %a0 to i16 + %a1_i16 = zext i8 %a1 to i16 + %res = $1 i16 %a0_i16, %a1_i16 + %over_mask = icmp ugt i16 %res, 255 + %over_res = select i1 %over_mask, i16 255, i16 %res + %under_mask = icmp slt i16 %res, 0 + %ret_i16 = select i1 %under_mask, i16 0, i16 %over_res + %ret = trunc i16 %ret_i16 to i8 ret i8 %ret } -declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__paddus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubs_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubs_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__psubus_i8(i8 %a0, i8 %a1) { - sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__psubus_i16(i16 %a0, i16 %a1) { - sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) +declare <8 x i16> @llvm.x86.sse2.p$1us.w(<8 x i16>, <8 x i16>) nounwind 
readnone +define i16 @__p$1us_i16(i16 %a0, i16 %a1) { + %a0_i32 = zext i16 %a0 to i32 + %a1_i32 = zext i16 %a1 to i32 + %res = $1 i32 %a0_i32, %a1_i32 + %over_mask = icmp ugt i32 %res, 65535 + %over_res = select i1 %over_mask, i32 65535, i32 %res + %under_mask = icmp slt i32 %res, 0 + %ret_i32 = select i1 %under_mask, i32 0, i32 %over_res + %ret = trunc i32 %ret_i32 to i16 ret i16 %ret } ') -;;no vector saturation arithmetic +;;uniform saturation arithmetic + +define(`saturation_arithmetic_uniform', ` +saturation_arithmetic_uniform_universal(sub) +saturation_arithmetic_uniform_universal(add) +') + +;; create vector constant. Used by saturation_arithmetic_novec_universal below. + +define(`const_vector', ` +ifelse(WIDTH, `4', `<$1 $2, $1 $2, $1 $2, $1 $2>', + WIDTH, `8', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>', + WIDTH, `16', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>', + WIDTH, `32', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>', + WIDTH, `64', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, + $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>', + `<$1 $2>')') + +;; utility function used by saturation_arithmetic_novec below. This shouldn't be called by +;; target .ll files directly. 
+;; $1: {add,sub} (used in constructing function names) + +define(`saturation_arithmetic_novec_universal', ` +define @__p$1s_vi8(, ) { + %v0_i16 = sext %0 to + %v1_i16 = sext %1 to + %res = $1 %v0_i16, %v1_i16 + %over_mask = icmp sgt %res, const_vector(i16, 127) + %over_res = select %over_mask, const_vector(i16, 127), %res + %under_mask = icmp slt %res, const_vector(i16, -128) + %ret_i16 = select %under_mask, const_vector(i16, -128), %over_res + %ret = trunc %ret_i16 to + ret %ret +} + +define @__p$1s_vi16(, ) { + %v0_i32 = sext %0 to + %v1_i32 = sext %1 to + %res = $1 %v0_i32, %v1_i32 + %over_mask = icmp sgt %res, const_vector(i32, 32767) + %over_res = select %over_mask, const_vector(i32, 32767), %res + %under_mask = icmp slt %res, const_vector(i32, -32768) + %ret_i32 = select %under_mask, const_vector(i32, -32768), %over_res + %ret = trunc %ret_i32 to + ret %ret +} + +define @__p$1us_vi8(, ) { + %v0_i16 = zext %0 to + %v1_i16 = zext %1 to + %res = $1 %v0_i16, %v1_i16 + %over_mask = icmp ugt %res, const_vector(i16, 255) + %over_res = select %over_mask, const_vector(i16, 255), %res + %under_mask = icmp slt %res, const_vector(i16, 0) + %ret_i16 = select %under_mask, const_vector(i16, 0), %over_res + %ret = trunc %ret_i16 to + ret %ret +} + +define @__p$1us_vi16(, ) { + %v0_i32 = zext %0 to + %v1_i32 = zext %1 to + %res = $1 %v0_i32, %v1_i32 + %over_mask = icmp ugt %res, const_vector(i32, 65535) + %over_res = select %over_mask, const_vector(i32, 65535), %res + %under_mask = icmp slt %res, const_vector(i32, 0) + %ret_i32 = select %under_mask, const_vector(i32, 0), %over_res + %ret = trunc %ret_i32 to + ret %ret +} +') + +;; implementation for targets which doesn't have h/w instructions define(`saturation_arithmetic_novec', ` -define @__padds_vi8(, ) { - convert1to16(i8, %0, %v0) - convert1to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to1(i8, %r16, %r) - ret %r -} - -define @__padds_vi16(, ) { - convert1to8(i16, %0, %v0) - convert1to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to1(i16, %r16, %r) - ret %r -} - -define @__paddus_vi8(, ) { - convert1to16(i8, %0, %v0) - convert1to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to1(i8, %r16, %r) - ret %r -} - -define @__paddus_vi16(, ) { - convert1to8(i16, %0, %v0) - convert1to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to1(i16, %r16, %r) - ret %r -} - -define @__psubs_vi8(, ) { - convert1to16(i8, %0, %v0) - convert1to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to1(i8, %r16, %r) - ret %r -} - -define @__psubs_vi16(, ) { - convert1to8(i16, %0, %v0) - convert1to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to1(i16, %r16, %r) - ret %r -} - -define @__psubus_vi8(, ) { - convert1to16(i8, %0, %v0) - convert1to16(i8, %1, %v1) - %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) - convert16to1(i8, %r16, %r) - ret %r -} - -define @__psubus_vi16(, ) { - convert1to8(i16, %0, %v0) - convert1to8(i16, %1, %v1) - %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) - convert8to1(i16, %r16, %r) - ret %r -} +saturation_arithmetic_novec_universal(sub) +saturation_arithmetic_novec_universal(add) ') ;;4-wide vector saturation arithmetic From 
97cc5b7f485269e55931df34dd4de630cd4d848a Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Mon, 6 Jan 2014 15:24:09 +0400 Subject: [PATCH 12/16] Added varying CFG and non-overflow part of the tests. --- tests/padds_i16-2.ispc | 11 ----------- tests/padds_i16.ispc | 22 +++++++++++++++++++--- tests/padds_i8-2.ispc | 11 ----------- tests/padds_i8.ispc | 22 +++++++++++++++++++--- tests/padds_vi16-2.ispc | 11 ----------- tests/padds_vi16.ispc | 24 ++++++++++++++++++++---- tests/padds_vi8-2.ispc | 11 ----------- tests/padds_vi8.ispc | 24 ++++++++++++++++++++---- tests/paddus_i16.ispc | 16 +++++++++++++--- tests/paddus_i8.ispc | 16 +++++++++++++--- tests/paddus_vi16.ispc | 18 ++++++++++++++---- tests/paddus_vi8.ispc | 19 +++++++++++++++---- tests/psubs_i16-2.ispc | 11 ----------- tests/psubs_i16.ispc | 22 +++++++++++++++++++--- tests/psubs_i8-2.ispc | 11 ----------- tests/psubs_i8.ispc | 22 +++++++++++++++++++--- tests/psubs_vi16-2.ispc | 11 ----------- tests/psubs_vi16.ispc | 24 ++++++++++++++++++++---- tests/psubs_vi8-2.ispc | 11 ----------- tests/psubs_vi8.ispc | 24 ++++++++++++++++++++---- tests/psubus_i16.ispc | 16 +++++++++++++--- tests/psubus_i8.ispc | 16 +++++++++++++--- tests/psubus_vi16.ispc | 18 ++++++++++++++---- tests/psubus_vi8.ispc | 18 ++++++++++++++---- 24 files changed, 265 insertions(+), 144 deletions(-) delete mode 100644 tests/padds_i16-2.ispc delete mode 100644 tests/padds_i8-2.ispc delete mode 100644 tests/padds_vi16-2.ispc delete mode 100644 tests/padds_vi8-2.ispc delete mode 100644 tests/psubs_i16-2.ispc delete mode 100644 tests/psubs_i8-2.ispc delete mode 100644 tests/psubs_vi16-2.ispc delete mode 100644 tests/psubs_vi8-2.ispc diff --git a/tests/padds_i16-2.ispc b/tests/padds_i16-2.ispc deleted file mode 100644 index 83234804..00000000 --- a/tests/padds_i16-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int16 a = -32768; // min signed int16 - RET[programIndex] = saturating_add(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (uniform int16) -32768; -} diff --git a/tests/padds_i16.ispc b/tests/padds_i16.ispc index e5456416..c763dd37 100644 --- a/tests/padds_i16.ispc +++ b/tests/padds_i16.ispc @@ -2,10 +2,26 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int16 a = 32767; // max signed int16 - RET[programIndex] = saturating_add(a, b); + uniform int16 a_max = 32767, a_min = -32768; // max and min signed int16 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_add(a_min, -b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform int16) 32767; + if (programIndex % 3 == 0) { + RET[programIndex] = (uniform int16) 32767; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (uniform int16) -32768; + } + else { + RET[programIndex] = (uniform int16) -32763; + } } diff --git a/tests/padds_i8-2.ispc b/tests/padds_i8-2.ispc deleted file mode 100644 index 9a303d70..00000000 --- a/tests/padds_i8-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int8 a = -128; // min signed int8 - RET[programIndex] 
= saturating_add(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (uniform int8) -128; -} diff --git a/tests/padds_i8.ispc b/tests/padds_i8.ispc index bbcc4cc7..7d272828 100644 --- a/tests/padds_i8.ispc +++ b/tests/padds_i8.ispc @@ -2,10 +2,26 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int8 a = 127; // max signed int8 - RET[programIndex] = saturating_add(a, b); + uniform int8 a_max = 127, a_min = -128; // max and min signed int8 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_add(a_min, -b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform int8) 127; + if (programIndex % 3 == 0) { + RET[programIndex] = (uniform int8) 127; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (uniform int8) -128; + } + else { + RET[programIndex] = (uniform int8) -123; + } } diff --git a/tests/padds_vi16-2.ispc b/tests/padds_vi16-2.ispc deleted file mode 100644 index 5f1eda37..00000000 --- a/tests/padds_vi16-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = -32768, b = aFOO[programIndex]; // max signed int16 - RET[programIndex] = saturating_add(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (varying int16) -32768; -} diff --git a/tests/padds_vi16.ispc b/tests/padds_vi16.ispc index e3bd0f51..5834a47a 100644 --- a/tests/padds_vi16.ispc +++ b/tests/padds_vi16.ispc @@ -1,11 +1,27 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = 32767, b = aFOO[programIndex]; // max signed int16 - RET[programIndex] = saturating_add(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying int16 a_max = 32767, a_min = -32768; // max and min signed int16 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_add(a_min, -b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying int16) 32767; + if (programIndex % 3 == 0) { + RET[programIndex] = (varying int16) 32767; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (varying int16) -32768; + } + else { + RET[programIndex] = (varying int16) -32763; + } } diff --git a/tests/padds_vi8-2.ispc b/tests/padds_vi8-2.ispc deleted file mode 100644 index e3302d18..00000000 --- a/tests/padds_vi8-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = -128, b = aFOO[programIndex]; // max signed int8 - RET[programIndex] = saturating_add(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (varying int8) -128; -} diff --git a/tests/padds_vi8.ispc b/tests/padds_vi8.ispc index df921414..0aca03d4 100644 --- a/tests/padds_vi8.ispc +++ b/tests/padds_vi8.ispc @@ -1,11 +1,27 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = 127, b = aFOO[programIndex]; // max signed int8 - 
RET[programIndex] = saturating_add(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying int8 a_max = 127, a_min = -128; // max and min signed int8 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_add(a_min, -b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying int8) 127; + if (programIndex % 3 == 0) { + RET[programIndex] = (varying int8) 127; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (varying int8) -128; + } + else { + RET[programIndex] = (varying int8) -123; + } } diff --git a/tests/paddus_i16.ispc b/tests/paddus_i16.ispc index e38f6db7..2032f161 100644 --- a/tests/paddus_i16.ispc +++ b/tests/paddus_i16.ispc @@ -2,10 +2,20 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform unsigned int16 a = 65535; // max unsigned int16 - RET[programIndex] = saturating_add(a, b); + uniform unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform unsigned int16) 65535; + if (programIndex % 2 == 0) { + RET[programIndex] = (uniform unsigned int16) 65535; + } + else { + RET[programIndex] = (uniform unsigned int16) 5; + } } diff --git a/tests/paddus_i8.ispc b/tests/paddus_i8.ispc index 7cd3ecf8..97436a86 100644 --- a/tests/paddus_i8.ispc +++ b/tests/paddus_i8.ispc @@ -2,10 +2,20 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform unsigned int8 a = 255; // max unsigned int8 - RET[programIndex] = saturating_add(a, b); + uniform unsigned int8 a_max = 255, a_min = 0; // max and min unsigned int8 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform unsigned int8) 255; + if (programIndex % 2 == 0) { + RET[programIndex] = (uniform unsigned int8) 255; + } + else { + RET[programIndex] = (uniform unsigned int8) 5; + } } diff --git a/tests/paddus_vi16.ispc b/tests/paddus_vi16.ispc index c4454cd2..d8bfa000 100644 --- a/tests/paddus_vi16.ispc +++ b/tests/paddus_vi16.ispc @@ -1,11 +1,21 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int16 a = 65535, b = aFOO[programIndex]; // max unsigned int16 - RET[programIndex] = saturating_add(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying unsigned int16) 65535; + if (programIndex % 2 == 0) { + RET[programIndex] = (varying unsigned int16) 65535; + } + else { + RET[programIndex] = (varying unsigned int16) 5; + } } diff --git a/tests/paddus_vi8.ispc b/tests/paddus_vi8.ispc index b7b970ff..59baa6fb 100644 --- a/tests/paddus_vi8.ispc +++ 
b/tests/paddus_vi8.ispc @@ -1,11 +1,22 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int8 a = 255, b = aFOO[programIndex]; // max unsigned int8 - RET[programIndex] = saturating_add(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying unsigned int8 a_max = 255, a_min = 0; // max and min signed int8 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying unsigned int8) 255; + if (programIndex % 2 == 0) { + RET[programIndex] = (varying unsigned int8) 255; + } + else { + RET[programIndex] = (varying unsigned int8) 5; + } } + diff --git a/tests/psubs_i16-2.ispc b/tests/psubs_i16-2.ispc deleted file mode 100644 index ace62b1c..00000000 --- a/tests/psubs_i16-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int16 a = 32767; // max signed int16 - RET[programIndex] = saturating_sub(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (uniform int16) 32767; -} diff --git a/tests/psubs_i16.ispc b/tests/psubs_i16.ispc index 47f3d2b9..4f27b3b4 100644 --- a/tests/psubs_i16.ispc +++ b/tests/psubs_i16.ispc @@ -2,10 +2,26 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int16 a = -32768; // min signed int16 - RET[programIndex] = saturating_sub(a, b); + uniform int16 a_max = 32767, a_min = -32768; // max and min signed int16 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_sub(a_max, -b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform int16) -32768; + if (programIndex % 3 == 0) { + RET[programIndex] = (uniform int16) -32768; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (uniform int16) 32767; + } + else { + RET[programIndex] = (uniform int16) 32762; + } } diff --git a/tests/psubs_i8-2.ispc b/tests/psubs_i8-2.ispc deleted file mode 100644 index 6d3d608a..00000000 --- a/tests/psubs_i8-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int8 a = 127; // max signed int8 - RET[programIndex] = saturating_sub(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (uniform int8) 127; -} diff --git a/tests/psubs_i8.ispc b/tests/psubs_i8.ispc index fbc24d25..e04867bd 100644 --- a/tests/psubs_i8.ispc +++ b/tests/psubs_i8.ispc @@ -2,10 +2,26 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform int8 a = -128; // min signed int8 - RET[programIndex] = saturating_sub(a, b); + uniform int8 a_max = 127, a_min = -128; // max and min signed int8 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_sub(a_max, -b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - 
RET[programIndex] = (uniform int8) -128; + if (programIndex % 3 == 0) { + RET[programIndex] = (uniform int8) -128; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (uniform int8) 127; + } + else { + RET[programIndex] = (uniform int8) 122; + } } diff --git a/tests/psubs_vi16-2.ispc b/tests/psubs_vi16-2.ispc deleted file mode 100644 index ef1b2ef4..00000000 --- a/tests/psubs_vi16-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = 32767, b = aFOO[programIndex]; // min unsigned int16 - RET[programIndex] = saturating_sub(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (varying int16) 32767; -} diff --git a/tests/psubs_vi16.ispc b/tests/psubs_vi16.ispc index e405a23f..df130115 100644 --- a/tests/psubs_vi16.ispc +++ b/tests/psubs_vi16.ispc @@ -1,11 +1,27 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int16 a = -32768, b = aFOO[programIndex]; // min unsigned int16 - RET[programIndex] = saturating_sub(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying int16 a_max = 32767, a_min = -32768; // max and min signed int16 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_sub(a_max, -b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying int16) -32768; + if (programIndex % 3 == 0) { + RET[programIndex] = (varying int16) -32768; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (varying int16) 32767; + } + else { + RET[programIndex] = (varying int16) 32762; + } } diff --git a/tests/psubs_vi8-2.ispc b/tests/psubs_vi8-2.ispc deleted file mode 100644 index b7fb02c6..00000000 --- a/tests/psubs_vi8-2.ispc +++ /dev/null @@ -1,11 +0,0 @@ - -export uniform int width() { return programCount; } - -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = 127, b = aFOO[programIndex]; // min unsigned int8 - RET[programIndex] = saturating_sub(a, -b); -} - -export void result(uniform float RET[]) { - RET[programIndex] = (varying int8) 127; -} diff --git a/tests/psubs_vi8.ispc b/tests/psubs_vi8.ispc index 7d852f0a..d7e9ff89 100644 --- a/tests/psubs_vi8.ispc +++ b/tests/psubs_vi8.ispc @@ -1,11 +1,27 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying int8 a = -128, b = aFOO[programIndex]; // min unsigned int8 - RET[programIndex] = saturating_sub(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying int8 a_max = 127, a_min = -128; // max and min signed int8 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_sub(a_max, -b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying int8) -128; + if (programIndex % 3 == 0) { + RET[programIndex] = (varying int8) -128; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (varying int8) 127; + } + else { + RET[programIndex] = (varying int8) 122; + } } diff --git a/tests/psubus_i16.ispc b/tests/psubus_i16.ispc index a7f60603..f9ae3568 100644 --- 
a/tests/psubus_i16.ispc +++ b/tests/psubus_i16.ispc @@ -2,10 +2,20 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform unsigned int8 a = 0; // min unsigned int16 - RET[programIndex] = saturating_sub(a, b); + uniform unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform unsigned int8) 0; + if (programIndex % 2 == 0) { + RET[programIndex] = (uniform unsigned int16) 0; + } + else { + RET[programIndex] = (uniform unsigned int16) 65530; + } } diff --git a/tests/psubus_i8.ispc b/tests/psubus_i8.ispc index 7cb7ecdc..e6f30b2a 100644 --- a/tests/psubus_i8.ispc +++ b/tests/psubus_i8.ispc @@ -2,10 +2,20 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { - uniform unsigned int8 a = 0; // min unsigned int8 - RET[programIndex] = saturating_sub(a, b); + uniform unsigned int8 a_max = 255, a_min = 0; // max and min unsigned int8 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (uniform unsigned int8) 0; + if (programIndex % 2 == 0) { + RET[programIndex] = (uniform unsigned int8) 0; + } + else { + RET[programIndex] = (uniform unsigned int8) 250; + } } diff --git a/tests/psubus_vi16.ispc b/tests/psubus_vi16.ispc index e441b699..0974cc5e 100644 --- a/tests/psubus_vi16.ispc +++ b/tests/psubus_vi16.ispc @@ -1,11 +1,21 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int16 a = 0, b = aFOO[programIndex]; // min unsigned int16 - RET[programIndex] = saturating_sub(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying unsigned int16) 0; + if (programIndex % 2 == 0) { + RET[programIndex] = (varying unsigned int16) 0; + } + else { + RET[programIndex] = (varying unsigned int16) 65530; + } } diff --git a/tests/psubus_vi8.ispc b/tests/psubus_vi8.ispc index 7ba5f14a..f7ad65d3 100644 --- a/tests/psubus_vi8.ispc +++ b/tests/psubus_vi8.ispc @@ -1,11 +1,21 @@ export uniform int width() { return programCount; } -export void f_f(uniform float RET[], uniform float aFOO[]) { - varying unsigned int8 a = 0, b = aFOO[programIndex]; // min unsigned int8 - RET[programIndex] = saturating_sub(a, b); +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying unsigned int8 a_max = 255, a_min = 0; // max and min signed int8 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } } export void result(uniform float RET[]) { - RET[programIndex] = (varying unsigned int8) 0; + if (programIndex % 2 == 0) { + RET[programIndex] = (varying unsigned int8) 0; + } + else { + RET[programIndex] = (varying unsigned int8) 250; + } } From 
da02236b3ac1d1a663949e81900ab8dbe71111a4 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Mon, 20 Jan 2014 16:06:34 +0400 Subject: [PATCH 13/16] Scalar realization of no-vec functions was replaced from builtins to stdlib.ispc. --- builtins.cpp | 8 --- builtins/target-avx-common.ll | 1 - builtins/target-generic-common.ll | 1 - builtins/target-sse2-common.ll | 1 - builtins/target-sse4-common.ll | 1 - builtins/util.m4 | 98 ++++++++++--------------------- stdlib.ispc | 72 ++++++++++++++++++++--- 7 files changed, 94 insertions(+), 88 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index c6828a00..b693ad3a 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -488,12 +488,8 @@ lSetInternalFunctions(llvm::Module *module) { "__num_cores", "__packed_load_active", "__packed_store_active", - "__padds_i8", - "__padds_i16", "__padds_vi8", "__padds_vi16", - "__paddus_i8", - "__paddus_i16", "__paddus_vi8", "__paddus_vi16", "__popcnt_int32", @@ -502,12 +498,8 @@ lSetInternalFunctions(llvm::Module *module) { "__prefetch_read_uniform_2", "__prefetch_read_uniform_3", "__prefetch_read_uniform_nt", - "__psubs_i8", - "__psubs_i16", "__psubs_vi8", "__psubs_vi16", - "__psubus_i8", - "__psubus_i16", "__psubus_vi8", "__psubus_vi16", "__rcp_uniform_float", diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index d6b577b8..dcca74f0 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -40,7 +40,6 @@ ctlztz() define_prefetches() define_shuffles() aossoa() -saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 6f5199d8..92b7a18e 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -41,7 +41,6 @@ stdlib_core() scans() reduce_equal(WIDTH) rdrand_decls() -saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; broadcast/rotate/shuffle diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index d8a461aa..ad1d88bc 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -34,7 +34,6 @@ define_prefetches() define_shuffles() aossoa() rdrand_decls() -saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 2dd5c149..50dd0582 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -37,7 +37,6 @@ define_prefetches() define_shuffles() aossoa() rdrand_decls() -saturation_arithmetic_uniform() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/util.m4 b/builtins/util.m4 index de48a0a1..4bdc501b 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -184,73 +184,7 @@ define(`convert32to16', ` define(`saturation_arithmetic', `ifelse(WIDTH, `4', `saturation_arithmetic_vec4()', WIDTH, `8', `saturation_arithmetic_vec8()', - WIDTH, `16', `saturation_arithmetic_vec16()', - `saturation_arithmetic_uniform()')') - -;; utility function used by saturation_arithmetic_uniform below. This shouldn't be called by -;; target .ll files directly. 
-;; $1: {add,sub} (used in constructing function names) - -define(`saturation_arithmetic_uniform_universal', ` -declare <16 x i8> @llvm.x86.sse2.p$1s.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__p$1s_i8(i8 %a0, i8 %a1) { - %a0_i16 = sext i8 %a0 to i16 - %a1_i16 = sext i8 %a1 to i16 - %res = $1 i16 %a0_i16, %a1_i16 - %over_mask = icmp sgt i16 %res, 127 - %over_res = select i1 %over_mask, i16 127, i16 %res - %under_mask = icmp slt i16 %res, -128 - %ret_i16 = select i1 %under_mask, i16 -128, i16 %over_res - %ret = trunc i16 %ret_i16 to i8 - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.p$1s.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__p$1s_i16(i16 %a0, i16 %a1) { - %a0_i32 = sext i16 %a0 to i32 - %a1_i32 = sext i16 %a1 to i32 - %res = $1 i32 %a0_i32, %a1_i32 - %over_mask = icmp sgt i32 %res, 32767 - %over_res = select i1 %over_mask, i32 32767, i32 %res - %under_mask = icmp slt i32 %res, -32768 - %ret_i32 = select i1 %under_mask, i32 -32768, i32 %over_res - %ret = trunc i32 %ret_i32 to i16 - ret i16 %ret -} - -declare <16 x i8> @llvm.x86.sse2.p$1us.b(<16 x i8>, <16 x i8>) nounwind readnone -define i8 @__p$1us_i8(i8 %a0, i8 %a1) { - %a0_i16 = zext i8 %a0 to i16 - %a1_i16 = zext i8 %a1 to i16 - %res = $1 i16 %a0_i16, %a1_i16 - %over_mask = icmp ugt i16 %res, 255 - %over_res = select i1 %over_mask, i16 255, i16 %res - %under_mask = icmp slt i16 %res, 0 - %ret_i16 = select i1 %under_mask, i16 0, i16 %over_res - %ret = trunc i16 %ret_i16 to i8 - ret i8 %ret -} - -declare <8 x i16> @llvm.x86.sse2.p$1us.w(<8 x i16>, <8 x i16>) nounwind readnone -define i16 @__p$1us_i16(i16 %a0, i16 %a1) { - %a0_i32 = zext i16 %a0 to i32 - %a1_i32 = zext i16 %a1 to i32 - %res = $1 i32 %a0_i32, %a1_i32 - %over_mask = icmp ugt i32 %res, 65535 - %over_res = select i1 %over_mask, i32 65535, i32 %res - %under_mask = icmp slt i32 %res, 0 - %ret_i32 = select i1 %under_mask, i32 0, i32 %over_res - %ret = trunc i32 %ret_i32 to i16 - ret i16 %ret -} -') - -;;uniform saturation arithmetic - -define(`saturation_arithmetic_uniform', ` -saturation_arithmetic_uniform_universal(sub) -saturation_arithmetic_uniform_universal(add) -') + WIDTH, `16', `saturation_arithmetic_vec16()')') ;; create vector constant. Used by saturation_arithmetic_novec_universal below. 
@@ -278,6 +212,7 @@ ifelse(WIDTH, `4', `<$1 $2, $1 $2, $1 $2, $1 $2>', ;; $1: {add,sub} (used in constructing function names) define(`saturation_arithmetic_novec_universal', ` +declare <16 x i8> @llvm.x86.sse2.p$1s.b(<16 x i8>, <16 x i8>) nounwind readnone define @__p$1s_vi8(, ) { %v0_i16 = sext %0 to %v1_i16 = sext %1 to @@ -290,6 +225,7 @@ define @__p$1s_vi8(, ) { ret %ret } +declare <8 x i16> @llvm.x86.sse2.p$1s.w(<8 x i16>, <8 x i16>) nounwind readnone define @__p$1s_vi16(, ) { %v0_i32 = sext %0 to %v1_i32 = sext %1 to @@ -302,6 +238,7 @@ define @__p$1s_vi16(, ) { ret %ret } +declare <16 x i8> @llvm.x86.sse2.p$1us.b(<16 x i8>, <16 x i8>) nounwind readnone define @__p$1us_vi8(, ) { %v0_i16 = zext %0 to %v1_i16 = zext %1 to @@ -313,7 +250,8 @@ define @__p$1us_vi8(, ) { %ret = trunc %ret_i16 to ret %ret } - + +declare <8 x i16> @llvm.x86.sse2.p$1us.w(<8 x i16>, <8 x i16>) nounwind readnone define @__p$1us_vi16(, ) { %v0_i32 = zext %0 to %v1_i32 = zext %1 to @@ -337,6 +275,7 @@ saturation_arithmetic_novec_universal(add) ;;4-wide vector saturation arithmetic define(`saturation_arithmetic_vec4', ` +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) @@ -345,6 +284,7 @@ define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) { ret <4 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) @@ -353,6 +293,7 @@ define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) { ret <4 x i16> %r } +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) @@ -361,6 +302,7 @@ define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) { ret <4 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <4 x i16> @__paddus_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) @@ -369,6 +311,7 @@ define <4 x i16> @__paddus_vi16(<4 x i16>, <4 x i16>) { ret <4 x i16> %r } +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) @@ -377,6 +320,7 @@ define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) { ret <4 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) @@ -385,6 +329,7 @@ define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) { ret <4 x i16> %r } +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) { convert4to16(i8, %0, %v0) convert4to16(i8, %1, %v1) @@ -393,6 +338,7 @@ define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) { ret <4 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) { convert4to8(i16, %0, %v0) convert4to8(i16, %1, %v1) @@ -405,6 +351,7 @@ define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) { ;;8-wide vector saturation arithmetic define(`saturation_arithmetic_vec8', ` +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) { 
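 ; widen the 8-wide operands to the <16 x i8> type the SSE2 intrinsic expects,
 ; call it, then narrow the result back; the same convert*/call/convert pattern
 ; is used by the other narrow __p*_vi8/__p*_vi16 wrappers in the vec4/vec8 macros.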
convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) @@ -413,11 +360,13 @@ define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) { ret <8 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ret <8 x i16> %res } +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) @@ -426,11 +375,13 @@ define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) { ret <8 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ret <8 x i16> %res } +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) @@ -439,11 +390,13 @@ define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) { ret <8 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ret <8 x i16> %res } +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) { convert8to16(i8, %0, %v0) convert8to16(i8, %1, %v1) @@ -452,6 +405,7 @@ define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) { ret <8 x i8> %r } +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ret <8 x i16> %res @@ -461,41 +415,49 @@ define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { ;;16-wide vector saturation arithmetic define(`saturation_arithmetic_vec16', ` +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) ret <16 x i16> %ret } +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) ret <16 x i16> %ret } +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, 
%a0, %a1) ret <16 x i16> %ret } +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res } +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) { binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) ret <16 x i16> %ret diff --git a/stdlib.ispc b/stdlib.ispc index 487b4184..f977abf8 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -57,6 +57,34 @@ #error Unknown value of ISPC_MASK_BITS #endif +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32768) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483648) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295) +#endif /////////////////////////////////////////////////////////////////////////// // Low level primitives @@ -4261,7 +4289,12 @@ static inline void fastmath() { // saturation arithmetic static inline uniform int8 saturating_add(uniform int8 a, uniform int8 b) { - return __padds_i8(a, b); + uniform unsigned int8 a_unsig = a, b_unsig = b; + uniform unsigned int8 result = a_unsig + b_unsig; + a_unsig = (a_unsig >> 7) + INT8_MAX; + if ((uniform int8) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0) + result = a_unsig; + return result; } static inline varying int8 saturating_add(varying int8 a, varying int8 b) { @@ -4269,7 +4302,12 @@ static inline varying int8 saturating_add(varying int8 a, varying int8 b) { } static inline uniform int16 saturating_add(uniform int16 a, uniform int16 b) { - return __padds_i16(a, b); + uniform unsigned int16 a_unsig = a, b_unsig = b; + uniform unsigned int16 result = a_unsig + b_unsig; + a_unsig = (a_unsig >> 15) + INT16_MAX; + if ((uniform int16) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0) + result = a_unsig; + return result; } static inline varying int16 saturating_add(varying int16 a, varying int16 b) { @@ -4278,7 +4316,9 @@ static inline varying int16 saturating_add(varying int16 a, varying int16 b) { static inline uniform unsigned int8 saturating_add(uniform unsigned int8 a, uniform unsigned int8 b) { - return __paddus_i8(a, b); + uniform unsigned int8 result = a + b; + result |= (-(uniform int8)(result < a)); + return result; } static inline varying unsigned int8 saturating_add(varying unsigned int8 a, @@ -4288,7 +4328,9 @@ static inline varying unsigned int8 saturating_add(varying unsigned int8 a, static inline uniform unsigned int16 saturating_add(uniform unsigned int16 a, uniform unsigned int16 b) { - return __paddus_i16(a, b); + uniform unsigned int16 result = a + b; + result |= (-(uniform int16)(result < a)); + return result; } static inline varying unsigned int16 saturating_add(varying unsigned int16 a, @@ -4297,7 +4339,12 @@ static inline varying unsigned int16 saturating_add(varying unsigned int16 a, } static inline uniform int8 saturating_sub(uniform int8 a, uniform int8 b) { - return __psubs_i8(a, b); + uniform unsigned int8 a_unsig = a, b_unsig = b; + uniform unsigned int8 result = a_unsig - b_unsig; + a_unsig = (a_unsig >> 7) + INT8_MAX; 
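+    // a_unsig now holds the saturation value: INT8_MAX when a >= 0, or the
+    // INT8_MIN bit pattern (0x80) when a < 0, so its sign bit still matches a's.
+    // The test below is the standard signed-overflow check for a - b: overflow
+    // occurs iff a and b have different signs and the result's sign differs from a's.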
+ if ((uniform int8) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0) + result = a_unsig; + return result; } static inline varying int8 saturating_sub(varying int8 a, varying int8 b) { @@ -4305,7 +4352,12 @@ static inline varying int8 saturating_sub(varying int8 a, varying int8 b) { } static inline uniform int16 saturating_sub(uniform int16 a, uniform int16 b) { - return __psubs_i16(a, b); + uniform unsigned int16 a_unsig = a, b_unsig = b; + uniform unsigned int16 result = a_unsig - b_unsig; + a_unsig = (a_unsig >> 15) + INT16_MAX; + if ((uniform int16) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0) + result = a_unsig; + return result; } static inline varying int16 saturating_sub(varying int16 a, varying int16 b) { @@ -4314,7 +4366,9 @@ static inline varying int16 saturating_sub(varying int16 a, varying int16 b) { static inline uniform unsigned int8 saturating_sub(uniform unsigned int8 a, uniform unsigned int8 b) { - return __psubus_i8(a, b); + uniform unsigned int8 result = a - b; + result &= (-(uniform int8)(result <= a)); + return result; } static inline varying unsigned int8 saturating_sub(varying unsigned int8 a, @@ -4324,7 +4378,9 @@ static inline varying unsigned int8 saturating_sub(varying unsigned int8 a, static inline uniform unsigned int16 saturating_sub(uniform unsigned int16 a, uniform unsigned int16 b) { - return __psubus_i16(a, b); + uniform unsigned int16 result = a - b; + result &= (-(uniform int16)(result <= a)); + return result; } static inline varying unsigned int16 saturating_sub(varying unsigned int16 a, From 1c1614d20755d441074b4084ee41d76e74464b2a Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Sun, 9 Feb 2014 21:39:42 +0400 Subject: [PATCH 14/16] Some errors in comments and code were fixed --- builtins/util.m4 | 17 +++-------------- stdlib.ispc | 27 ++++++++++++++++++--------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/builtins/util.m4 b/builtins/util.m4 index 4bdc501b..a991ae09 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -50,11 +50,11 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector convertation utilities -;; convert 1-wide vector into 8-wide vector +;; convert vector of one width into vector of other width ;; ;; $1: vector element type -;; $2: 1-wide vector -;; $3: 8-wide vector +;; $2: vector of the first width +;; $3: vector of the second width define(`convert1to8', ` @@ -129,13 +129,6 @@ define(`convert16to32', ` i32 undef, i32 undef, i32 undef, i32 undef> ') -;; convert 4-wide vector into 8-wide vector -;; -;; $1: vector element type -;; $2: 8-wide vector -;; $3: 1-wide vector - - define(`convert8to1', ` $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <1 x i32> @@ -212,7 +205,6 @@ ifelse(WIDTH, `4', `<$1 $2, $1 $2, $1 $2, $1 $2>', ;; $1: {add,sub} (used in constructing function names) define(`saturation_arithmetic_novec_universal', ` -declare <16 x i8> @llvm.x86.sse2.p$1s.b(<16 x i8>, <16 x i8>) nounwind readnone define @__p$1s_vi8(, ) { %v0_i16 = sext %0 to %v1_i16 = sext %1 to @@ -225,7 +217,6 @@ define @__p$1s_vi8(, ) { ret %ret } -declare <8 x i16> @llvm.x86.sse2.p$1s.w(<8 x i16>, <8 x i16>) nounwind readnone define @__p$1s_vi16(, ) { %v0_i32 = sext %0 to %v1_i32 = sext %1 to @@ -238,7 +229,6 @@ define @__p$1s_vi16(, ) { ret %ret } -declare <16 x i8> @llvm.x86.sse2.p$1us.b(<16 x i8>, <16 x i8>) nounwind readnone define @__p$1us_vi8(, ) { %v0_i16 = zext %0 to %v1_i16 = zext %1 to @@ -251,7 +241,6 @@ define @__p$1us_vi8(, ) { ret %ret } -declare <8 x 
i16> @llvm.x86.sse2.p$1us.w(<8 x i16>, <8 x i16>) nounwind readnone define @__p$1us_vi16(, ) { %v0_i32 = zext %0 to %v1_i32 = zext %1 to diff --git a/stdlib.ispc b/stdlib.ispc index f977abf8..9bb13f4e 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -58,15 +58,6 @@ #endif /* Limits of integral types. */ -#ifndef INT8_MIN -#define INT8_MIN (-128) -#endif -#ifndef INT16_MIN -#define INT16_MIN (-32768) -#endif -#ifndef INT32_MIN -#define INT32_MIN (-2147483648) -#endif #ifndef INT8_MAX #define INT8_MAX (127) #endif @@ -76,6 +67,9 @@ #ifndef INT32_MAX #define INT32_MAX (2147483647) #endif +#ifndef INT64_MAX +#define INT64_MAX (9223372036854775807) +#endif #ifndef UINT8_MAX #define UINT8_MAX (255) #endif @@ -85,6 +79,21 @@ #ifndef UINT32_MAX #define UINT32_MAX (4294967295) #endif +#ifndef UINT64_MAX +#define UINT64_MAX (18446744073709551615) +#endif +#ifndef INT8_MIN +#define INT8_MIN (-INT8_MAX - 1) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-INT16_MAX - 1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-INT32_MAX - 1) +#endif +#ifndef INT64_MIN +#define INT64_MIN (-INT64_MAX - 1) +#endif /////////////////////////////////////////////////////////////////////////// // Low level primitives From 65d947e44905d10bf1a2edde5a5c39bd7533c987 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Mon, 10 Feb 2014 15:18:48 +0400 Subject: [PATCH 15/16] Else branch with error report was added --- builtins/util.m4 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/builtins/util.m4 b/builtins/util.m4 index 86051436..df2adab2 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -177,7 +177,8 @@ define(`convert32to16', ` define(`saturation_arithmetic', `ifelse(WIDTH, `4', `saturation_arithmetic_vec4()', WIDTH, `8', `saturation_arithmetic_vec8()', - WIDTH, `16', `saturation_arithmetic_vec16()')') + WIDTH, `16', `saturation_arithmetic_vec16() ', + `ERROR_unappropriate_width')') ;; create vector constant. Used by saturation_arithmetic_novec_universal below. From ea0a514e03eb6389099feb1b918d3c2378221a44 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 11 Feb 2014 15:33:23 +0400 Subject: [PATCH 16/16] Fix for generic-1 --- builtins/target-generic-1.ll | 1 - builtins/util.m4 | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 94ffe87e..a3de92f3 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -10,7 +10,6 @@ packed_load_and_store() scans() int64minmax() aossoa() -saturation_arithmetic() saturation_arithmetic_novec() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/util.m4 b/builtins/util.m4 index df2adab2..025018e9 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -178,7 +178,10 @@ define(`saturation_arithmetic', `ifelse(WIDTH, `4', `saturation_arithmetic_vec4()', WIDTH, `8', `saturation_arithmetic_vec8()', WIDTH, `16', `saturation_arithmetic_vec16() ', - `ERROR_unappropriate_width')') + `errprint(`ERROR: saturation_arithmetic() macro called with unsupported width = 'WIDTH +) + m4exit(`1')') +') ;; create vector constant. Used by saturation_arithmetic_novec_universal below.
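For quick reference, a minimal ISPC usage sketch of the saturating_add/saturating_sub stdlib overloads introduced by this series. It is illustrative only and not part of the patches; the export name and array parameters are made up for the example.

// Saturating arithmetic clamps at the type's limits instead of wrapping.
export void saturate_demo(uniform int8 s8[], uniform unsigned int8 u8[]) {
    uniform int8 smax = 127, one = 1;        // INT8_MAX
    uniform unsigned int8 uzero = 0, uone = 1;
    foreach (i = 0 ... programCount) {
        s8[i] = saturating_add(smax, one);   // stays 127; a wrapping add gives -128
        u8[i] = saturating_sub(uzero, uone); // stays 0; a wrapping sub gives 255
    }
}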