diff --git a/builtins.cpp b/builtins.cpp
index 2afd92d9..c001318a 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -488,12 +488,21 @@ lSetInternalFunctions(llvm::Module *module) {
         "__num_cores",
         "__packed_load_active",
         "__packed_store_active",
+        "__padds_i8",
+        "__padds_i16",
+        "__vpadds_i8",
+        "__paddus_i8",
+        "__paddus_i16",
         "__popcnt_int32",
         "__popcnt_int64",
         "__prefetch_read_uniform_1",
         "__prefetch_read_uniform_2",
         "__prefetch_read_uniform_3",
         "__prefetch_read_uniform_nt",
+        "__psubs_i8",
+        "__psubs_i16",
+        "__psubus_i8",
+        "__psubus_i16",
         "__rcp_uniform_float",
         "__rcp_varying_float",
         "__rdrand_i16",
diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll
index ad1d88bc..070912ea 100644
--- a/builtins/target-sse2-common.ll
+++ b/builtins/target-sse2-common.ll
@@ -35,6 +35,71 @@ define_shuffles()
 aossoa()
 rdrand_decls()
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; saturation arithmetic
+;;
+;; Wrappers around the SSE2 saturating add/subtract intrinsics
+;; (padds/psubs: signed, paddus/psubus: unsigned; .b = i8 lanes, .w = i16
+;; lanes).  The scalar entry points expand the sse_binary_scalar() m4 macro,
+;; which is defined elsewhere; presumably it runs the vector intrinsic on a
+;; single lane and returns that lane -- confirm against the macro definition.
+
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <16 x i8> @__vpadds_i8(<16 x i8> %a0, <16 x i8> %a1) {
+  ; full-width form: signed saturating add applied to all sixteen i8 lanes
+  %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %res
+}
+
+
+;; @llvm.x86.sse2.padds.b is already declared above (shared with @__vpadds_i8)
+define i8 @__padds_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__padds_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1)
+  ret i16 %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__paddus_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__paddus_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1)
+  ret i16 %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__psubs_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__psubs_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1)
+  ret i16 %ret
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define i8 @__psubus_i8(i8 %a0, i8 %a1) {
+  sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1)
+  ret i8 %ret
+}
+
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define i16 @__psubus_i16(i16 %a0, i16 %a1) {
+  sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1)
+  ret i16 %ret
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
 
diff --git a/stdlib.ispc b/stdlib.ispc
index 6768594b..464da5d4 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -4257,6 +4257,46 @@ static inline void fastmath() {
     __fastmath();
 }
 
+///////////////////////////////////////////////////////////////////////////
+// saturation arithmetic
+//
+// Uniform-only wrappers over the __padds/__paddus/__psubs/__psubus builtins
+// for 8- and 16-bit integers.  The "s" variants take signed operands and the
+// "us" variants unsigned ones; results clamp to the range of the operand
+// type instead of wrapping on overflow.
+
+static inline uniform int8 padds(uniform int8 a, uniform int8 b) {
+    return __padds_i8(a, b);
+}
+
+static inline uniform int16 padds(uniform int16 a, uniform int16 b) {
+    return __padds_i16(a, b);
+}
+
+static inline uniform unsigned int8 paddus(uniform unsigned int8 a, uniform unsigned int8 b) {
+    return __paddus_i8(a, b);
+}
+
+static inline uniform unsigned int16 paddus(uniform unsigned int16 a, uniform unsigned int16 b) {
+    return __paddus_i16(a, b);
+}
+
+static inline uniform int8 psubs(uniform int8 a, uniform int8 b) {
+    return __psubs_i8(a, b);
+}
+
+static inline uniform int16 psubs(uniform int16 a, uniform int16 b) {
+    return __psubs_i16(a, b);
+}
+
+static inline uniform unsigned int8 psubus(uniform unsigned int8 a, uniform unsigned int8 b) {
+    return __psubus_i8(a, b);
+}
+
+static inline uniform unsigned int16 psubus(uniform unsigned int16 a, uniform unsigned int16 b) {
+    return __psubus_i16(a, b);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // rdrand
 