diff --git a/builtins.cpp b/builtins.cpp index fee322e7..2c1acc36 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -489,12 +489,20 @@ lSetInternalFunctions(llvm::Module *module) { "__packed_load_active", "__packed_store_active", "__packed_store_active2", + "__padds_vi8", + "__padds_vi16", + "__paddus_vi8", + "__paddus_vi16", "__popcnt_int32", "__popcnt_int64", "__prefetch_read_uniform_1", "__prefetch_read_uniform_2", "__prefetch_read_uniform_3", "__prefetch_read_uniform_nt", + "__psubs_vi8", + "__psubs_vi16", + "__psubus_vi8", + "__psubus_vi16", "__rcp_uniform_float", "__rcp_varying_float", "__rcp_uniform_double", diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index b3a77871..68a67133 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -40,6 +40,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic() include(`target-avx-common.ll') diff --git a/builtins/target-avx1-i64x4base.ll b/builtins/target-avx1-i64x4base.ll index a6601a28..19b47b1d 100644 --- a/builtins/target-avx1-i64x4base.ll +++ b/builtins/target-avx1-i64x4base.ll @@ -40,6 +40,7 @@ stdlib_core() packed_load_and_store() scans() int64minmax() +saturation_arithmetic() include(`target-avx-common.ll') diff --git a/builtins/target-avx1.ll b/builtins/target-avx1.ll index 9c86cab8..a9ddc112 100644 --- a/builtins/target-avx1.ll +++ b/builtins/target-avx1.ll @@ -32,6 +32,7 @@ include(`target-avx.ll') rdrand_decls() +saturation_arithmetic() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index fea0a7c2..c4c421a0 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -35,6 +35,8 @@ ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', `rdrand_definition()') +saturation_arithmetic() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index f4a0ee07..20ecef47 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -39,6 +39,8 @@ ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', `rdrand_definition()') +saturation_arithmetic() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 3dcd8373..a3de92f3 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -10,6 +10,7 @@ packed_load_and_store() scans() int64minmax() aossoa() +saturation_arithmetic_novec() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store diff --git a/builtins/target-generic-16.ll b/builtins/target-generic-16.ll index 807fd242..cc5644bc 100644 --- a/builtins/target-generic-16.ll +++ b/builtins/target-generic-16.ll @@ -31,4 +31,4 @@ define(`WIDTH',`16') include(`target-generic-common.ll') - +saturation_arithmetic_novec() diff --git a/builtins/target-generic-32.ll b/builtins/target-generic-32.ll index 5f89bcdf..8eb31c48 100644 --- a/builtins/target-generic-32.ll +++ b/builtins/target-generic-32.ll @@ -31,3 +31,4 @@ define(`WIDTH',`32') include(`target-generic-common.ll') +saturation_arithmetic_novec() diff --git a/builtins/target-generic-4.ll b/builtins/target-generic-4.ll index 7eb1f300..d80c5b91 100644 --- a/builtins/target-generic-4.ll +++ b/builtins/target-generic-4.ll @@ -31,4 +31,4 @@ 
 define(`WIDTH',`4')
 
 include(`target-generic-common.ll')
-
+saturation_arithmetic_novec()
diff --git a/builtins/target-generic-64.ll b/builtins/target-generic-64.ll
index 09443f8e..6a044c41 100644
--- a/builtins/target-generic-64.ll
+++ b/builtins/target-generic-64.ll
@@ -31,3 +31,4 @@
 define(`WIDTH',`64')
 
 include(`target-generic-common.ll')
+saturation_arithmetic_novec()
diff --git a/builtins/target-generic-8.ll b/builtins/target-generic-8.ll
index bd9261ff..4353658c 100644
--- a/builtins/target-generic-8.ll
+++ b/builtins/target-generic-8.ll
@@ -31,4 +31,4 @@
 define(`WIDTH',`8')
 
 include(`target-generic-common.ll')
-
+saturation_arithmetic_novec()
diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll
index bfb927e5..9dcb064f 100644
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -44,6 +44,7 @@
 stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()
 
 include(`target-sse2-common.ll')
diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll
index 93a8eb93..6a5709fd 100644
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -41,6 +41,7 @@
 stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()
 
 include(`target-sse2-common.ll')
diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll
index 0de5c1b4..c8f72d45 100644
--- a/builtins/target-sse4-16.ll
+++ b/builtins/target-sse4-16.ll
@@ -41,6 +41,7 @@
 stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()
 
 include(`target-sse4-common.ll')
diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll
index 79f44212..4b394734 100644
--- a/builtins/target-sse4-8.ll
+++ b/builtins/target-sse4-8.ll
@@ -41,6 +41,7 @@
 stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()
 
 include(`target-sse4-common.ll')
diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll
index ceff27f0..e87f4640 100644
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -44,6 +44,7 @@
 stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()
 
 include(`target-sse4-common.ll')
diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll
index 9e2ac8a5..9819d385 100644
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -41,6 +41,7 @@
 stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
+saturation_arithmetic()
 
 include(`target-sse4-common.ll')
diff --git a/builtins/util.m4 b/builtins/util.m4
index fbd929a1..025018e9 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -49,6 +49,416 @@ define(`MASK_HIGH_BIT_ON',
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; vector conversion utilities
+;; convert vector of one width into vector of other width
+;;
+;; $1: vector element type
+;; $2: vector of the first width
+;; $3: vector of the second width
+
+
+define(`convert1to8', `
+  $3 = shufflevector <1 x $1> $2, <1 x $1> undef,
+      <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                 i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+define(`convert1to16', `
+  $3 = shufflevector <1 x $1> $2, <1 x $1> undef,
+      <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+define(`convert4to8', `
+  $3 = shufflevector <4 x $1> $2, <4 x $1> undef,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                 i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+define(`convert4to16', `
+  $3 = shufflevector <4 x $1> $2, <4 x $1> undef,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+define(`convert8to16', `
+  $3 = shufflevector <8 x $1> $2, <8 x $1> undef,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                  i32 4, i32 5, i32 6, i32 7,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+define(`convert4to32', `
+  $3 = shufflevector <4 x $1> $2, <4 x $1> undef,
+      <32 x i32> <i32 0, i32 1, i32 2, i32 3,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+define(`convert8to32', `
+  $3 = shufflevector <8 x $1> $2, <8 x $1> undef,
+      <32 x i32> <i32 0, i32 1, i32 2, i32 3,
+                  i32 4, i32 5, i32 6, i32 7,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+define(`convert16to32', `
+  $3 = shufflevector <16 x $1> $2, <16 x $1> undef,
+      <32 x i32> <i32 0, i32 1, i32 2, i32 3,
+                  i32 4, i32 5, i32 6, i32 7,
+                  i32 8, i32 9, i32 10, i32 11,
+                  i32 12, i32 13, i32 14, i32 15,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef,
+                  i32 undef, i32 undef, i32 undef, i32 undef>
+')
+
+define(`convert8to1', `
+  $3 = shufflevector <8 x $1> $2, <8 x $1> undef,
+      <1 x i32> <i32 0>
+')
+
+define(`convert16to1', `
+  $3 = shufflevector <16 x $1> $2, <16 x $1> undef,
+      <1 x i32> <i32 0>
+')
+
+define(`convert8to4', `
+  $3 = shufflevector <8 x $1> $2, <8 x $1> undef,
+      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+')
+
+define(`convert16to4', `
+  $3 = shufflevector <16 x $1> $2, <16 x $1> undef,
+      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+')
+
+define(`convert16to8', `
+  $3 = shufflevector <16 x $1> $2, <16 x $1> undef,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                 i32 4, i32 5, i32 6, i32 7>
+')
+
+define(`convert32to4', `
+  $3 = shufflevector <32 x $1> $2, <32 x $1> undef,
+      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+')
+
+define(`convert32to8', `
+  $3 = shufflevector <32 x $1> $2, <32 x $1> undef,
+      <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                 i32 4, i32 5, i32 6, i32 7>
+')
+
+define(`convert32to16', `
+  $3 = shufflevector <32 x $1> $2, <32 x $1> undef,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                  i32 4, i32 5, i32 6, i32 7,
+                  i32 8, i32 9, i32 10, i32 11,
+                  i32 12, i32 13, i32 14, i32 15>
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; saturation arithmetic
+
+define(`saturation_arithmetic',
+`ifelse(WIDTH, `4', `saturation_arithmetic_vec4()',
+        WIDTH, `8', `saturation_arithmetic_vec8()',
+        WIDTH, `16', `saturation_arithmetic_vec16()',
+                     `errprint(`ERROR: saturation_arithmetic() macro called with unsupported width = 'WIDTH
+)
+                      m4exit(`1')')
+')
+
+;; create vector constant. Used by saturation_arithmetic_novec_universal below.
+
+define(`const_vector', `
+ifelse(WIDTH, `4', `<$1 $2, $1 $2, $1 $2, $1 $2>',
+       WIDTH, `8', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
+       WIDTH, `16', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
+       WIDTH, `32', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
+       WIDTH, `64', `<$1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2,
+                      $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2, $1 $2>',
+                    `<$1 $2>')')
+
+;; utility function used by saturation_arithmetic_novec below. This shouldn't be called by
+;; target .ll files directly.
+;; $1: {add,sub} (used in constructing function names)
+
+define(`saturation_arithmetic_novec_universal', `
+define <WIDTH x i8> @__p$1s_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0_i16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
+  %v1_i16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
+  %res = $1 <WIDTH x i16> %v0_i16, %v1_i16
+  %over_mask = icmp sgt <WIDTH x i16> %res, const_vector(i16, 127)
+  %over_res = select <WIDTH x i1> %over_mask, <WIDTH x i16> const_vector(i16, 127), <WIDTH x i16> %res
+  %under_mask = icmp slt <WIDTH x i16> %res, const_vector(i16, -128)
+  %ret_i16 = select <WIDTH x i1> %under_mask, <WIDTH x i16> const_vector(i16, -128), <WIDTH x i16> %over_res
+  %ret = trunc <WIDTH x i16> %ret_i16 to <WIDTH x i8>
+  ret <WIDTH x i8> %ret
+}
+
+define <WIDTH x i16> @__p$1s_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0_i32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
+  %v1_i32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
+  %res = $1 <WIDTH x i32> %v0_i32, %v1_i32
+  %over_mask = icmp sgt <WIDTH x i32> %res, const_vector(i32, 32767)
+  %over_res = select <WIDTH x i1> %over_mask, <WIDTH x i32> const_vector(i32, 32767), <WIDTH x i32> %res
+  %under_mask = icmp slt <WIDTH x i32> %res, const_vector(i32, -32768)
+  %ret_i32 = select <WIDTH x i1> %under_mask, <WIDTH x i32> const_vector(i32, -32768), <WIDTH x i32> %over_res
+  %ret = trunc <WIDTH x i32> %ret_i32 to <WIDTH x i16>
+  ret <WIDTH x i16> %ret
+}
+
+define <WIDTH x i8> @__p$1us_vi8(<WIDTH x i8>, <WIDTH x i8>) {
+  %v0_i16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
+  %v1_i16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
+  %res = $1 <WIDTH x i16> %v0_i16, %v1_i16
+  %over_mask = icmp ugt <WIDTH x i16> %res, const_vector(i16, 255)
+  %over_res = select <WIDTH x i1> %over_mask, <WIDTH x i16> const_vector(i16, 255), <WIDTH x i16> %res
+  %under_mask = icmp slt <WIDTH x i16> %res, const_vector(i16, 0)
+  %ret_i16 = select <WIDTH x i1> %under_mask, <WIDTH x i16> const_vector(i16, 0), <WIDTH x i16> %over_res
+  %ret = trunc <WIDTH x i16> %ret_i16 to <WIDTH x i8>
+  ret <WIDTH x i8> %ret
+}
+
+define <WIDTH x i16> @__p$1us_vi16(<WIDTH x i16>, <WIDTH x i16>) {
+  %v0_i32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
+  %v1_i32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
+  %res = $1 <WIDTH x i32> %v0_i32, %v1_i32
+  %over_mask = icmp ugt <WIDTH x i32> %res, const_vector(i32, 65535)
+  %over_res = select <WIDTH x i1> %over_mask, <WIDTH x i32> const_vector(i32, 65535), <WIDTH x i32> %res
+  %under_mask = icmp slt <WIDTH x i32> %res, const_vector(i32, 0)
+  %ret_i32 = select <WIDTH x i1> %under_mask, <WIDTH x i32> const_vector(i32, 0), <WIDTH x i32> %over_res
+  %ret = trunc <WIDTH x i32> %ret_i32 to <WIDTH x i16>
+  ret <WIDTH x i16> %ret
+}
+')
+
+;; implementation for targets that don't have h/w saturation instructions
+
+define(`saturation_arithmetic_novec', `
+saturation_arithmetic_novec_universal(sub)
+saturation_arithmetic_novec_universal(add)
+')
+
+;;4-wide vector saturation arithmetic
+
+define(`saturation_arithmetic_vec4', `
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <4 x i8> @__padds_vi8(<4 x i8>, <4 x i8>) {
+  convert4to16(i8, %0, %v0)
+  convert4to16(i8, %1, %v1)
+  %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1)
+  convert16to4(i8, %r16, %r)
+  ret <4 x i8> %r
+}
+
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <4 x i16> @__padds_vi16(<4 x i16>, <4 x i16>) {
+  convert4to8(i16, %0, %v0)
+  convert4to8(i16, %1, %v1)
+  %r16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %v0, <8 x i16> %v1)
+  convert8to4(i16, %r16, %r)
+  ret <4 x i16> %r
+}
+
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <4 x i8> @__paddus_vi8(<4 x i8>, <4 x i8>) {
+  convert4to16(i8, %0, %v0)
+  convert4to16(i8, %1, %v1)
+  %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1)
+  convert16to4(i8, %r16, %r)
+  ret <4 x i8> %r
+}
+
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+define <4 x i16> @__paddus_vi16(<4 x i16>, <4 x i16>) {
+  convert4to8(i16, %0, %v0)
+  convert4to8(i16, %1, %v1)
+  %r16 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %v0, <8 x i16> %v1)
+  convert8to4(i16, %r16, %r)
+  ret <4 x i16> %r
+}
+
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+define <4 x i8> @__psubs_vi8(<4 x i8>, <4 x i8>) {
+  convert4to16(i8, %0, %v0)
+  convert4to16(i8, %1, %v1)
+  %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1)
+  convert16to4(i8, %r16, %r)
+  ret <4 x i8> %r
+}
+
+declare <8 x
i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone +define <4 x i16> @__psubs_vi16(<4 x i16>, <4 x i16>) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret <4 x i16> %r +} + +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone +define <4 x i8> @__psubus_vi8(<4 x i8>, <4 x i8>) { + convert4to16(i8, %0, %v0) + convert4to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to4(i8, %r16, %r) + ret <4 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone +define <4 x i16> @__psubus_vi16(<4 x i16>, <4 x i16>) { + convert4to8(i16, %0, %v0) + convert4to8(i16, %1, %v1) + %r16 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %v0, <8 x i16> %v1) + convert8to4(i16, %r16, %r) + ret <4 x i16> %r +} +') + +;;8-wide vector saturation arithmetic + +define(`saturation_arithmetic_vec8', ` +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone +define <8 x i8> @__padds_vi8(<8 x i8>, <8 x i8>) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone +define <8 x i16> @__padds_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res +} + +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone +define <8 x i8> @__paddus_vi8(<8 x i8>, <8 x i8>) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone +define <8 x i16> @__paddus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res +} + +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone +define <8 x i8> @__psubs_vi8(<8 x i8>, <8 x i8>) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone +define <8 x i16> @__psubs_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res +} + +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone +define <8 x i8> @__psubus_vi8(<8 x i8>, <8 x i8>) { + convert8to16(i8, %0, %v0) + convert8to16(i8, %1, %v1) + %r16 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %v0, <16 x i8> %v1) + convert16to8(i8, %r16, %r) + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone +define <8 x i16> @__psubus_vi16(<8 x i16> %a0, <8 x i16> %a1) { + %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res +} +') + +;;16-wide vector saturation arithmetic + +define(`saturation_arithmetic_vec16', ` +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone +define <16 x i8> @__padds_vi8(<16 x i8> %a0, <16 x i8> 
%a1) { + %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone +define <16 x i16> @__padds_vi16(<16 x i16> %a0, <16 x i16> %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.padds.w, %a0, %a1) + ret <16 x i16> %ret +} + +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone +define <16 x i8> @__paddus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone +define <16 x i16> @__paddus_vi16(<16 x i16> %a0, <16 x i16> %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.paddus.w, %a0, %a1) + ret <16 x i16> %ret +} + +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone +define <16 x i8> @__psubs_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone +define <16 x i16> @__psubs_vi16(<16 x i16> %a0, <16 x i16> %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubs.w, %a0, %a1) + ret <16 x i16> %ret +} + +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone +define <16 x i8> @__psubus_vi8(<16 x i8> %a0, <16 x i8> %a1) { + %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone +define <16 x i16> @__psubus_vi16(<16 x i16> %a0, <16 x i16> %a1) { + binary8to16(ret, i16, @llvm.x86.sse2.psubus.w, %a0, %a1) + ret <16 x i16> %ret +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; vector deconstruction utilities ;; split 8-wide vector into 2 4-wide vectors ;; diff --git a/stdlib.ispc b/stdlib.ispc index 24217cd0..cb41c49c 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -57,6 +57,43 @@ #error Unknown value of ISPC_MASK_BITS #endif +/* Limits of integral types. 
*/ +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef INT64_MAX +#define INT64_MAX (9223372036854775807) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295) +#endif +#ifndef UINT64_MAX +#define UINT64_MAX (18446744073709551615) +#endif +#ifndef INT8_MIN +#define INT8_MIN (-INT8_MAX - 1) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-INT16_MAX - 1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-INT32_MAX - 1) +#endif +#ifndef INT64_MIN +#define INT64_MIN (-INT64_MAX - 1) +#endif /////////////////////////////////////////////////////////////////////////// // Low level primitives @@ -4345,6 +4382,108 @@ static inline void fastmath() { __fastmath(); } +/////////////////////////////////////////////////////////////////////////// +// saturation arithmetic + +static inline uniform int8 saturating_add(uniform int8 a, uniform int8 b) { + uniform unsigned int8 a_unsig = a, b_unsig = b; + uniform unsigned int8 result = a_unsig + b_unsig; + a_unsig = (a_unsig >> 7) + INT8_MAX; + if ((uniform int8) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0) + result = a_unsig; + return result; +} + +static inline varying int8 saturating_add(varying int8 a, varying int8 b) { + return __padds_vi8(a, b); +} + +static inline uniform int16 saturating_add(uniform int16 a, uniform int16 b) { + uniform unsigned int16 a_unsig = a, b_unsig = b; + uniform unsigned int16 result = a_unsig + b_unsig; + a_unsig = (a_unsig >> 15) + INT16_MAX; + if ((uniform int16) ((a_unsig ^ b_unsig) | ~(b_unsig ^ result)) >= 0) + result = a_unsig; + return result; +} + +static inline varying int16 saturating_add(varying int16 a, varying int16 b) { + return __padds_vi16(a, b); +} + +static inline uniform unsigned int8 saturating_add(uniform unsigned int8 a, + uniform unsigned int8 b) { + uniform unsigned int8 result = a + b; + result |= (-(uniform int8)(result < a)); + return result; +} + +static inline varying unsigned int8 saturating_add(varying unsigned int8 a, + varying unsigned int8 b) { + return __paddus_vi8(a, b); +} + +static inline uniform unsigned int16 saturating_add(uniform unsigned int16 a, + uniform unsigned int16 b) { + uniform unsigned int16 result = a + b; + result |= (-(uniform int16)(result < a)); + return result; +} + +static inline varying unsigned int16 saturating_add(varying unsigned int16 a, + varying unsigned int16 b) { + return __paddus_vi16(a, b); +} + +static inline uniform int8 saturating_sub(uniform int8 a, uniform int8 b) { + uniform unsigned int8 a_unsig = a, b_unsig = b; + uniform unsigned int8 result = a_unsig - b_unsig; + a_unsig = (a_unsig >> 7) + INT8_MAX; + if ((uniform int8) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0) + result = a_unsig; + return result; +} + +static inline varying int8 saturating_sub(varying int8 a, varying int8 b) { + return __psubs_vi8(a, b); +} + +static inline uniform int16 saturating_sub(uniform int16 a, uniform int16 b) { + uniform unsigned int16 a_unsig = a, b_unsig = b; + uniform unsigned int16 result = a_unsig - b_unsig; + a_unsig = (a_unsig >> 15) + INT16_MAX; + if ((uniform int16) ((a_unsig ^ b_unsig) & (a_unsig ^ result)) < 0) + result = a_unsig; + return result; +} + +static inline varying int16 saturating_sub(varying int16 a, varying int16 b) { + return __psubs_vi16(a, b); +} + +static inline uniform unsigned int8 saturating_sub(uniform 
unsigned int8 a, + uniform unsigned int8 b) { + uniform unsigned int8 result = a - b; + result &= (-(uniform int8)(result <= a)); + return result; +} + +static inline varying unsigned int8 saturating_sub(varying unsigned int8 a, + varying unsigned int8 b) { + return __psubus_vi8(a, b); +} + +static inline uniform unsigned int16 saturating_sub(uniform unsigned int16 a, + uniform unsigned int16 b) { + uniform unsigned int16 result = a - b; + result &= (-(uniform int16)(result <= a)); + return result; +} + +static inline varying unsigned int16 saturating_sub(varying unsigned int16 a, + varying unsigned int16 b) { + return __psubus_vi16(a, b); +} /////////////////////////////////////////////////////////////////////////// // rdrand diff --git a/tests/padds_i16.ispc b/tests/padds_i16.ispc new file mode 100644 index 00000000..c763dd37 --- /dev/null +++ b/tests/padds_i16.ispc @@ -0,0 +1,27 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int16 a_max = 32767, a_min = -32768; // max and min signed int16 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_add(a_min, -b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 3 == 0) { + RET[programIndex] = (uniform int16) 32767; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (uniform int16) -32768; + } + else { + RET[programIndex] = (uniform int16) -32763; + } +} diff --git a/tests/padds_i8.ispc b/tests/padds_i8.ispc new file mode 100644 index 00000000..7d272828 --- /dev/null +++ b/tests/padds_i8.ispc @@ -0,0 +1,27 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int8 a_max = 127, a_min = -128; // max and min signed int8 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_add(a_min, -b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 3 == 0) { + RET[programIndex] = (uniform int8) 127; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (uniform int8) -128; + } + else { + RET[programIndex] = (uniform int8) -123; + } +} diff --git a/tests/padds_vi16.ispc b/tests/padds_vi16.ispc new file mode 100644 index 00000000..5834a47a --- /dev/null +++ b/tests/padds_vi16.ispc @@ -0,0 +1,27 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying int16 a_max = 32767, a_min = -32768; // max and min signed int16 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_add(a_min, -b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 3 == 0) { + RET[programIndex] = (varying int16) 32767; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (varying int16) -32768; + } + else { + RET[programIndex] = (varying int16) -32763; + } +} diff --git a/tests/padds_vi8.ispc b/tests/padds_vi8.ispc new file mode 100644 index 00000000..0aca03d4 --- /dev/null +++ b/tests/padds_vi8.ispc @@ -0,0 +1,27 @@ + +export uniform int 
width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying int8 a_max = 127, a_min = -128; // max and min signed int8 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_add(a_min, -b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 3 == 0) { + RET[programIndex] = (varying int8) 127; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (varying int8) -128; + } + else { + RET[programIndex] = (varying int8) -123; + } +} diff --git a/tests/paddus_i16.ispc b/tests/paddus_i16.ispc new file mode 100644 index 00000000..2032f161 --- /dev/null +++ b/tests/paddus_i16.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 2 == 0) { + RET[programIndex] = (uniform unsigned int16) 65535; + } + else { + RET[programIndex] = (uniform unsigned int16) 5; + } +} diff --git a/tests/paddus_i8.ispc b/tests/paddus_i8.ispc new file mode 100644 index 00000000..97436a86 --- /dev/null +++ b/tests/paddus_i8.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform unsigned int8 a_max = 255, a_min = 0; // max and min unsigned int8 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 2 == 0) { + RET[programIndex] = (uniform unsigned int8) 255; + } + else { + RET[programIndex] = (uniform unsigned int8) 5; + } +} diff --git a/tests/paddus_vi16.ispc b/tests/paddus_vi16.ispc new file mode 100644 index 00000000..d8bfa000 --- /dev/null +++ b/tests/paddus_vi16.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 2 == 0) { + RET[programIndex] = (varying unsigned int16) 65535; + } + else { + RET[programIndex] = (varying unsigned int16) 5; + } +} diff --git a/tests/paddus_vi8.ispc b/tests/paddus_vi8.ispc new file mode 100644 index 00000000..59baa6fb --- /dev/null +++ b/tests/paddus_vi8.ispc @@ -0,0 +1,22 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying unsigned int8 a_max = 255, a_min = 0; // max and min signed int8 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_add(a_max, b); + } + else { + RET[programIndex] = saturating_add(a_min, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 2 == 0) { + RET[programIndex] = (varying unsigned int8) 255; + } + else { + RET[programIndex] = 
(varying unsigned int8) 5; + } +} + diff --git a/tests/psubs_i16.ispc b/tests/psubs_i16.ispc new file mode 100644 index 00000000..4f27b3b4 --- /dev/null +++ b/tests/psubs_i16.ispc @@ -0,0 +1,27 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int16 a_max = 32767, a_min = -32768; // max and min signed int16 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_sub(a_max, -b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 3 == 0) { + RET[programIndex] = (uniform int16) -32768; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (uniform int16) 32767; + } + else { + RET[programIndex] = (uniform int16) 32762; + } +} diff --git a/tests/psubs_i8.ispc b/tests/psubs_i8.ispc new file mode 100644 index 00000000..e04867bd --- /dev/null +++ b/tests/psubs_i8.ispc @@ -0,0 +1,27 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int8 a_max = 127, a_min = -128; // max and min signed int8 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_sub(a_max, -b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 3 == 0) { + RET[programIndex] = (uniform int8) -128; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (uniform int8) 127; + } + else { + RET[programIndex] = (uniform int8) 122; + } +} diff --git a/tests/psubs_vi16.ispc b/tests/psubs_vi16.ispc new file mode 100644 index 00000000..df130115 --- /dev/null +++ b/tests/psubs_vi16.ispc @@ -0,0 +1,27 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying int16 a_max = 32767, a_min = -32768; // max and min signed int16 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_sub(a_max, -b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 3 == 0) { + RET[programIndex] = (varying int16) -32768; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (varying int16) 32767; + } + else { + RET[programIndex] = (varying int16) 32762; + } +} diff --git a/tests/psubs_vi8.ispc b/tests/psubs_vi8.ispc new file mode 100644 index 00000000..d7e9ff89 --- /dev/null +++ b/tests/psubs_vi8.ispc @@ -0,0 +1,27 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying int8 a_max = 127, a_min = -128; // max and min signed int8 + if (programIndex % 3 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else if (programIndex % 3 == 1) { + RET[programIndex] = saturating_sub(a_max, -b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 3 == 0) { + RET[programIndex] = (varying int8) -128; + } + else if (programIndex % 3 == 1) { + RET[programIndex] = (varying int8) 127; + } + else { + RET[programIndex] = (varying int8) 122; + } +} diff --git 
a/tests/psubus_i16.ispc b/tests/psubus_i16.ispc new file mode 100644 index 00000000..f9ae3568 --- /dev/null +++ b/tests/psubus_i16.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 2 == 0) { + RET[programIndex] = (uniform unsigned int16) 0; + } + else { + RET[programIndex] = (uniform unsigned int16) 65530; + } +} diff --git a/tests/psubus_i8.ispc b/tests/psubus_i8.ispc new file mode 100644 index 00000000..e6f30b2a --- /dev/null +++ b/tests/psubus_i8.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform unsigned int8 a_max = 255, a_min = 0; // max and min unsigned int8 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 2 == 0) { + RET[programIndex] = (uniform unsigned int8) 0; + } + else { + RET[programIndex] = (uniform unsigned int8) 250; + } +} diff --git a/tests/psubus_vi16.ispc b/tests/psubus_vi16.ispc new file mode 100644 index 00000000..0974cc5e --- /dev/null +++ b/tests/psubus_vi16.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying unsigned int16 a_max = 65535, a_min = 0; // max and min unsigned int16 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 2 == 0) { + RET[programIndex] = (varying unsigned int16) 0; + } + else { + RET[programIndex] = (varying unsigned int16) 65530; + } +} diff --git a/tests/psubus_vi8.ispc b/tests/psubus_vi8.ispc new file mode 100644 index 00000000..f7ad65d3 --- /dev/null +++ b/tests/psubus_vi8.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + varying unsigned int8 a_max = 255, a_min = 0; // max and min signed int8 + if (programIndex % 2 == 0) { + RET[programIndex] = saturating_sub(a_min, b); + } + else { + RET[programIndex] = saturating_sub(a_max, b); + } +} + +export void result(uniform float RET[]) { + if (programIndex % 2 == 0) { + RET[programIndex] = (varying unsigned int8) 0; + } + else { + RET[programIndex] = (varying unsigned int8) 250; + } +}
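
For reference, a minimal usage sketch of the saturating_add/saturating_sub overloads this patch adds to stdlib.ispc; the kernel name, buffer names, and constants below are illustrative only and are not part of the patch:

// Illustrative only: exercises the 8- and 16-bit saturating overloads added above.
export void saturate_demo(uniform int8 out8[], uniform unsigned int16 out16[],
                          uniform int count) {
    foreach (i = 0 ... count) {
        varying int8 a = 120;
        varying int8 b = (int8)(i & 0x1f);             // 0..31
        out8[i] = saturating_add(a, b);                // clamps at 127 instead of wrapping

        varying unsigned int16 c = 10;
        varying unsigned int16 d = (unsigned int16)i;
        out16[i] = saturating_sub(c, d);               // clamps at 0 instead of wrapping
    }
}

The varying overloads lower to the __padds/__paddus/__psubs/__psubus builtins defined per target in builtins/util.m4, while the uniform overloads use the branchless scalar sequences added to stdlib.ispc.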