From 35a4d1b3a27b88dce6f6c7e9e6eeef7bc5fbdf99 Mon Sep 17 00:00:00 2001 From: Vsevolod Livinskij Date: Wed, 27 Nov 2013 00:55:57 +0400 Subject: [PATCH] Add some AVX2 intrinsics --- builtins.cpp | 9 +++- builtins/target-avx-common.ll | 96 +++++++++++++++++++++++++++++++++ builtins/target-sse2-common.ll | 55 ++++++++++++++++--- builtins/target-sse4-common.ll | 98 ++++++++++++++++++++++++++++++++++ stdlib.ispc | 47 ++++++++++++++-- 5 files changed, 292 insertions(+), 13 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index c001318a..c6828a00 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -490,9 +490,12 @@ lSetInternalFunctions(llvm::Module *module) { "__packed_store_active", "__padds_i8", "__padds_i16", - "__vpadds_i8", + "__padds_vi8", + "__padds_vi16", "__paddus_i8", "__paddus_i16", + "__paddus_vi8", + "__paddus_vi16", "__popcnt_int32", "__popcnt_int64", "__prefetch_read_uniform_1", @@ -501,8 +504,12 @@ lSetInternalFunctions(llvm::Module *module) { "__prefetch_read_uniform_nt", "__psubs_i8", "__psubs_i16", + "__psubs_vi8", + "__psubs_vi16", "__psubus_i8", "__psubus_i16", + "__psubus_vi8", + "__psubus_vi16", "__rcp_uniform_float", "__rcp_varying_float", "__rdrand_i16", diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index 1c467476..d47145f2 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -41,7 +41,103 @@ define_prefetches() define_shuffles() aossoa() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; saturation arithmetic +declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone +define i8 @__padds_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.padds.b, %a0, %a1) + ret i8 %ret +} + +define <32 x i8> @__padds_vi8(<32 x i8> %a0, <32 x i8> %a1) { + %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} + + +declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone +define i16 @__padds_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.padds.w, %a0, %a1) + ret i16 %ret +} + +define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) { + %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} + + +declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone +define i8 @__paddus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.paddus.b, %a0, %a1) + ret i8 %ret +} + +define <32 x i8> @__paddus_vi8(<32 x i8> %a0, <32 x i8> %a1) { + %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} + + +declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone +define i16 @__paddus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.paddus.w, %a0, %a1) + ret i16 %ret +} + +define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) { + %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} + + +declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone +define i8 @__psubs_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubs.b, %a0, %a1) + ret i8 %ret +} + +define <32 x i8> @__psubs_vi8(<32 x i8> %a0, <32 x i8> %a1) { + %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> 
[#uses=1] + ret <32 x i8> %res +} + + +declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone +define i16 @__psubs_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubs.w, %a0, %a1) + ret i16 %ret +} + +define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) { + %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} + + +declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone +define i8 @__psubus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 32, i8, @llvm.x86.avx2.psubus.b, %a0, %a1) + ret i8 %ret +} + +define <32 x i8> @__psubus_vi8(<32 x i8> %a0, <32 x i8> %a1) { + %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} + + +declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone +define i16 @__psubus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 16, i16, @llvm.x86.avx2.psubus.w, %a0, %a1) + ret i16 %ret +} + +define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) { + %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index 070912ea..a1e6f915 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -37,62 +37,101 @@ rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; saturation arithmetic + declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone -define <16 x i8> @__vpadds_i8(<16 x i8> %a0, <16 x i8> %a1) { - ; CHECK: vpaddsb - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} - - -;;declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__padds_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) ret i8 %ret } +define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__padds_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) ret i16 %ret } +define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__paddus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) ret i8 %ret } +define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__paddus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) ret i16 %ret } +define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> 
[#uses=1] + ret <8 x i16> %res +} + + declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubs_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) ret i8 %ret } +define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubs_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) ret i16 %ret } +define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone define i8 @__psubus_i8(i8 %a0, i8 %a1) { sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) ret i8 %ret } +define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone define i16 @__psubus_i16(i16 %a0, i16 %a1) { sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) ret i16 %ret } +define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 50dd0582..70acca63 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -38,6 +38,104 @@ define_shuffles() aossoa() rdrand_decls() +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; saturation arithmetic + +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__padds_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.padds.b, %a0, %a1) + ret i8 %ret +} + +define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__padds_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret i16 %ret +} + +define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__paddus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.paddus.b, %a0, %a1) + ret i8 %ret +} + +define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__paddus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret i16 %ret +} + +define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + 
%res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubs_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubs.b, %a0, %a1) + ret i8 %ret +} + +define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubs_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret i16 %ret +} + +define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone +define i8 @__psubus_i8(i8 %a0, i8 %a1) { + sse_binary_scalar(ret, 16, i8, @llvm.x86.sse2.psubus.b, %a0, %a1) + ret i8 %ret +} + +define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone +define i16 @__psubus_i16(i16 %a0, i16 %a1) { + sse_binary_scalar(ret, 8, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret i16 %ret +} + +define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/stdlib.ispc b/stdlib.ispc index 464da5d4..5b3d144c 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4264,34 +4264,73 @@ static inline uniform int8 padds(uniform int8 a, uniform int8 b) { return __padds_i8(a, b); } +static inline varying int8 padds(varying int8 a, varying int8 b) { + return __padds_vi8(a, b); +} + static inline uniform int16 padds(uniform int16 a, uniform int16 b) { return __padds_i16(a, b); } -static inline uniform unsigned int8 paddus(uniform unsigned int8 a, uniform unsigned int8 b) { +static inline varying int16 padds(varying int16 a, varying int16 b) { + return __padds_vi16(a, b); +} + +static inline uniform unsigned int8 paddus(uniform unsigned int8 a, + uniform unsigned int8 b) { return __paddus_i8(a, b); } -static inline uniform unsigned int16 paddus(uniform unsigned int16 a, unsigned uniform int16 b) { +static inline varying unsigned int8 paddus(varying unsigned int8 a, + varying unsigned int8 b) { + return __paddus_vi8(a, b); +} + +static inline uniform unsigned int16 paddus(uniform unsigned int16 a, + unsigned uniform int16 b) { return __paddus_i16(a, b); } +static inline varying unsigned int16 paddus(varying unsigned int16 a, + unsigned varying int16 b) { + return __paddus_vi16(a, b); +} + static inline uniform int8 psubs(uniform int8 a, uniform int8 b) { return __psubs_i8(a, b); } +static inline varying int8 psubs(varying int8 a, varying int8 b) { + return __psubs_vi8(a, b); +} + static inline uniform int16 psubs(uniform int16 a, uniform int16 b) { return __psubs_i16(a, b); } -static inline uniform unsigned int8 psubus(uniform unsigned int8 a, uniform unsigned int8 b) { +static inline varying int16 psubs(varying int16 a, varying int16 b) { 
+ return __psubs_vi16(a, b); +} + +static inline uniform unsigned int8 psubus(uniform unsigned int8 a, + uniform unsigned int8 b) { return __psubus_i8(a, b); } -static inline uniform unsigned int16 psubus(uniform unsigned int16 a, unsigned uniform int16 b) { +static inline varying unsigned int8 psubus(varying unsigned int8 a, + varying unsigned int8 b) { + return __psubus_vi8(a, b); +} + +static inline uniform unsigned int16 psubus(uniform unsigned int16 a, + unsigned uniform int16 b) { return __psubus_i16(a, b); } +static inline varying unsigned int16 psubus(varying unsigned int16 a, + unsigned varying int16 b) { + return __psubus_vi16(a, b); +} /////////////////////////////////////////////////////////////////////////// // rdrand
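
Usage note (not part of the patch): a minimal sketch of how the new varying
overloads would be called from ISPC code, assuming the padds/paddus/psubs/psubus
wrappers added to stdlib.ispc are visible to user programs in the same way as
the existing uniform ones. The kernel and array names below are hypothetical.

    // Saturating add of two int8 streams; with varying operands, padds()
    // resolves to the new __padds_vi8 builtin rather than the scalar
    // __padds_i8 path.
    export void saturating_add_i8(uniform int8 a[], uniform int8 b[],
                                  uniform int8 result[], uniform int count) {
        foreach (i = 0 ... count) {
            int8 va = a[i];
            int8 vb = b[i];
            result[i] = padds(va, vb);
        }
    }

In target-avx-common.ll the varying forms are backed by the 256-bit
llvm.x86.avx2.padds/paddus/psubs/psubus intrinsics declared above, while the
target-sse2-common.ll and target-sse4-common.ll variants use the 128-bit
llvm.x86.sse2.* equivalents.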