From 5b20b06bd9c75d84e78749b752716d6f2088b8d1 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sat, 3 Aug 2013 20:44:25 -0700 Subject: [PATCH] Add avg_{up,down}_int{8,16} routines to stdlib These compute the average of the two given values, rounding up and down, respectively, if the result isn't exact. When possible, these are mapped to target-specific intrinsics (PAVG[BW] on IA and VH[R]ADD[US] on NEON). A subsequent commit will add pattern-matching to generate calls to these intrinsics when the corresponding patterns are detected in the IR. --- builtins/target-avx-common.ll | 6 ++ builtins/target-generic-1.ll | 6 ++ builtins/target-generic-common.ll | 5 ++ builtins/target-neon-16.ll | 59 ++++++++++++++ builtins/target-neon-32.ll | 59 ++++++++++++++ builtins/target-neon-8.ll | 75 +++++++++++++++++ builtins/target-sse2-common.ll | 4 + builtins/target-sse4-16.ll | 31 ++++++++ builtins/target-sse4-8.ll | 25 ++++++ builtins/target-sse4-x2.ll | 6 ++ builtins/target-sse4.ll | 6 ++ builtins/util.m4 | 128 ++++++++++++++++++++++++++++-- docs/ispc.rst | 25 ++++++ opt.cpp | 8 ++ stdlib.ispc | 60 +++++++++++--- tests/avg-down-int16.ispc | 13 +++ tests/avg-down-int8.ispc | 13 +++ tests/avg-down-uint16.ispc | 13 +++ tests/avg-down-uint8.ispc | 13 +++ tests/avg-up-int16.ispc | 13 +++ tests/avg-up-int8.ispc | 13 +++ tests/avg-up-uint16.ispc | 13 +++ tests/avg-up-uint8.ispc | 13 +++ 23 files changed, 592 insertions(+), 15 deletions(-) create mode 100644 tests/avg-down-int16.ispc create mode 100644 tests/avg-down-int8.ispc create mode 100644 tests/avg-down-uint16.ispc create mode 100644 tests/avg-down-uint8.ispc create mode 100644 tests/avg-up-int16.ispc create mode 100644 tests/avg-up-int8.ispc create mode 100644 tests/avg-up-uint16.ispc create mode 100644 tests/avg-up-uint8.ispc
diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index dcbe0a66..1d317713 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -277,3 +277,9 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) ret double %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() +
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 238de444..3472c207 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -864,3 +864,9 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone declare @__half_to_float_varying( %v) nounwind readnone declare i16 @__float_to_half_uniform(float %v) nounwind readnone declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() +
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index b581e0a7..c683ff45 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -364,3 +364,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() +
diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll index fd15eb0b..a0575927 100644 --- a/builtins/target-neon-16.ll +++ b/builtins/target-neon-16.ll @@ -456,3 +456,62 @@
define i64 @__reduce_min_uint64() nounwind readnone { define i64 @__reduce_max_uint64() nounwind readnone { reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 + +declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} diff --git a/builtins/target-neon-32.ll b/builtins/target-neon-32.ll index 1f8003d7..30b062c9 100644 --- a/builtins/target-neon-32.ll +++ b/builtins/target-neon-32.ll @@ -426,3 +426,62 @@ define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone { define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 + +declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r +} + +declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r +} + +declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> 
@llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r +} + +declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r +} + +declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone + +define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} + +declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone + +define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} + +declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone + +define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} + +declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone + +define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll index eb65f224..2accfe53 100644 --- a/builtins/target-neon-8.ll +++ b/builtins/target-neon-8.ll @@ -506,3 +506,78 @@ define i64 @__reduce_min_uint64() nounwind readnone { define i64 @__reduce_max_uint64() nounwind readnone { reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone 
{ + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index c6a3afe2..ad1d88bc 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -269,4 +269,8 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline { ret i64 %val } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 50f0848d..b4772552 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -449,3 +449,34 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) { + %r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index 7fa9075b..a75d8e3a 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -456,3 +456,28 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x 
i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 4a447ba6..897a09eb 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -573,3 +573,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) ret <8 x double> %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 7f9a9185..5429b461 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -473,3 +473,9 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/util.m4 b/builtins/util.m4 index 025030d5..95e3844d 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,9 +49,9 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; vector assembly and deconstruction utilities +;; vector deconstruction utilities ;; split 8-wide vector into 2 4-wide vectors -;; +;; ;; $1: vector element type ;; $2: 8-wide vector ;; $3: first 4-wide vector @@ -71,10 +71,6 @@ define(`v16tov8', ` <8 x i32> ') -;; 4-wide into 2 2-wide -;; args as above -;; - define(`v4tov2', ` $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> @@ -96,6 +92,20 @@ define(`v16tov4', ` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector assembly: wider vector from two narrower vectors +;; +;; $1: vector element type +;; $2: first n-wide vector +;; $3: second n-wide vector +;; $4: result 2*n-wide vector +define(`v8tov16', ` + $4 = shufflevector <8 x $1> $2, <8 x $1> $3, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Helper macro for calling various SSE instructions for scalar values ;; but where the instruction takes a vector parameter. 
;; $1 : name of variable to put the final value in @@ -4276,3 +4286,109 @@ define i1 @__rdrand_i64(i64 * %ptr) { ret i1 %good } ') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define(`define_avg_up_uint8', ` +define <WIDTH x i8> @__avg_up_uint8(<WIDTH x i8>, <WIDTH x i8>) { + %a16 = zext <WIDTH x i8> %0 to <WIDTH x i16> + %b16 = zext <WIDTH x i8> %1 to <WIDTH x i16> + %sum1 = add <WIDTH x i16> %a16, %b16 + %sum = add <WIDTH x i16> %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = lshr <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc <WIDTH x i16> %avg to <WIDTH x i8> + ret <WIDTH x i8> %r +}') + +define(`define_avg_up_int8', ` +define <WIDTH x i8> @__avg_up_int8(<WIDTH x i8>, <WIDTH x i8>) { + %a16 = sext <WIDTH x i8> %0 to <WIDTH x i16> + %b16 = sext <WIDTH x i8> %1 to <WIDTH x i16> + %sum1 = add <WIDTH x i16> %a16, %b16 + %sum = add <WIDTH x i16> %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = sdiv <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc <WIDTH x i16> %avg to <WIDTH x i8> + ret <WIDTH x i8> %r +}') + +define(`define_avg_up_uint16', ` +define <WIDTH x i16> @__avg_up_uint16(<WIDTH x i16>, <WIDTH x i16>) { + %a32 = zext <WIDTH x i16> %0 to <WIDTH x i32> + %b32 = zext <WIDTH x i16> %1 to <WIDTH x i32> + %sum1 = add <WIDTH x i32> %a32, %b32 + %sum = add <WIDTH x i32> %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = lshr <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc <WIDTH x i32> %avg to <WIDTH x i16> + ret <WIDTH x i16> %r +}') + +define(`define_avg_up_int16', ` +define <WIDTH x i16> @__avg_up_int16(<WIDTH x i16>, <WIDTH x i16>) { + %a32 = sext <WIDTH x i16> %0 to <WIDTH x i32> + %b32 = sext <WIDTH x i16> %1 to <WIDTH x i32> + %sum1 = add <WIDTH x i32> %a32, %b32 + %sum = add <WIDTH x i32> %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = sdiv <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc <WIDTH x i32> %avg to <WIDTH x i16> + ret <WIDTH x i16> %r +}') + +define(`define_avg_down_uint8', ` +define <WIDTH x i8> @__avg_down_uint8(<WIDTH x i8>, <WIDTH x i8>) { + %a16 = zext <WIDTH x i8> %0 to <WIDTH x i16> + %b16 = zext <WIDTH x i8> %1 to <WIDTH x i16> + %sum = add <WIDTH x i16> %a16, %b16 + %avg = lshr <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc <WIDTH x i16> %avg to <WIDTH x i8> + ret <WIDTH x i8> %r +}') + +define(`define_avg_down_int8', ` +define <WIDTH x i8> @__avg_down_int8(<WIDTH x i8>, <WIDTH x i8>) { + %a16 = sext <WIDTH x i8> %0 to <WIDTH x i16> + %b16 = sext <WIDTH x i8> %1 to <WIDTH x i16> + %sum = add <WIDTH x i16> %a16, %b16 + %avg = sdiv <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc <WIDTH x i16> %avg to <WIDTH x i8> + ret <WIDTH x i8> %r +}') + +define(`define_avg_down_uint16', ` +define <WIDTH x i16> @__avg_down_uint16(<WIDTH x i16>, <WIDTH x i16>) { + %a32 = zext <WIDTH x i16> %0 to <WIDTH x i32> + %b32 = zext <WIDTH x i16> %1 to <WIDTH x i32> + %sum = add <WIDTH x i32> %a32, %b32 + %avg = lshr <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc <WIDTH x i32> %avg to <WIDTH x i16> + ret <WIDTH x i16> %r +}') + +define(`define_avg_down_int16', ` +define <WIDTH x i16> @__avg_down_int16(<WIDTH x i16>, <WIDTH x i16>) { + %a32 = sext <WIDTH x i16> %0 to <WIDTH x i32> + %b32 = sext <WIDTH x i16> %1 to <WIDTH x i32> + %sum = add <WIDTH x i32> %a32, %b32 + %avg = sdiv <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc <WIDTH x i32> %avg to <WIDTH x i16> + ret <WIDTH x i16> %r +}') + +define(`define_up_avgs', ` +define_avg_up_uint8() +define_avg_up_int8() +define_avg_up_uint16() +define_avg_up_int16() +') + +define(`define_down_avgs', ` +define_avg_down_uint8() +define_avg_down_int8() +define_avg_down_uint16() +define_avg_down_int16() +') + +define(`define_avgs', ` +define_up_avgs() +define_down_avgs() +')
diff --git a/docs/ispc.rst b/docs/ispc.rst index 8456f126..eb8333de 100755 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3399,6 +3399,31 @@ The ``isnan()`` functions test whether the given value is a floating-point uniform bool isnan(uniform double v) +A number of functions are also available for performing operations on 8- and +16-bit quantities; these map to specialized instructions that perform these +operations on targets that support them. ``avg_up()`` computes the average +of the two values, rounding up if their average is halfway between two +integers (i.e., it computes ``(a+b+1)/2``).
+ +:: + + int8 avg_up(int8 a, int8 b) + unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) + int16 avg_up(int16 a, int16 b) + unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) + + +``avg_down()`` computes the average of the two values, rounding down (i.e., +it computes ``(a+b)/2``). + +:: + + int8 avg_down(int8 a, int8 b) + unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) + int16 avg_down(int16 a, int16 b) + unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) + + Transcendental Functions ------------------------ diff --git a/opt.cpp b/opt.cpp index 8c86368e..b363f0e1 100644 --- a/opt.cpp +++ b/opt.cpp @@ -4343,6 +4343,14 @@ char MakeInternalFuncsStaticPass::ID = 0; bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { const char *names[] = { + "__avg_up_uint8", + "__avg_up_int8", + "__avg_up_uint16", + "__avg_up_int16", + "__avg_down_uint8", + "__avg_down_int8", + "__avg_down_uint16", + "__avg_down_int16", "__fast_masked_vload", "__gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i64", diff --git a/stdlib.ispc b/stdlib.ispc index affa7fef..dc94d7e3 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4812,8 +4812,8 @@ static const uniform int64 __idiv_table_s32[][3] = { }; __declspec(safe) -static unmasked unsigned int8 __fast_idiv(unsigned int8 numerator, - uniform unsigned int8 divisor) { +static unmasked inline unsigned int8 +__fast_idiv(unsigned int8 numerator, uniform unsigned int8 divisor) { uniform int64 method = __idiv_table_u8[divisor-2][0]; uniform int64 multiplier = __idiv_table_u8[divisor-2][1]; uniform int64 shift = __idiv_table_u8[divisor-2][2]; @@ -4833,7 +4833,7 @@ static unmasked unsigned int8 __fast_idiv(unsigned int8 numerator, } __declspec(safe) -static unmasked int8 __fast_idiv(int8 numerator, uniform int8 divisor) { +static unmasked inline int8 __fast_idiv(int8 numerator, uniform int8 divisor) { uniform int8 method = __idiv_table_s8[divisor-2][0]; uniform int16 multiplier = __idiv_table_s8[divisor-2][1]; uniform int8 shift = __idiv_table_s8[divisor-2][2]; @@ -4850,8 +4850,8 @@ static unmasked int8 __fast_idiv(int8 numerator, uniform int8 divisor) { } __declspec(safe) -static unmasked unsigned int16 __fast_idiv(unsigned int16 numerator, - uniform unsigned int16 divisor) { +static unmasked inline unsigned int16 __fast_idiv(unsigned int16 numerator, + uniform unsigned int16 divisor) { uniform int64 method = __idiv_table_u16[divisor-2][0]; uniform int64 multiplier = __idiv_table_u16[divisor-2][1]; uniform int64 shift = __idiv_table_u16[divisor-2][2]; @@ -4871,7 +4871,7 @@ static unmasked unsigned int16 __fast_idiv(unsigned int16 numerator, } __declspec(safe) -static unmasked int16 __fast_idiv(int16 numerator, uniform int16 divisor) { +static unmasked inline int16 __fast_idiv(int16 numerator, uniform int16 divisor) { uniform int64 method = __idiv_table_s16[divisor-2][0]; uniform int64 multiplier = __idiv_table_s16[divisor-2][1]; uniform int64 shift = __idiv_table_s16[divisor-2][2]; @@ -4889,8 +4889,8 @@ static unmasked int16 __fast_idiv(int16 numerator, uniform int16 divisor) { } __declspec(safe) -static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator, - uniform unsigned int32 divisor) { +static unmasked inline inline unsigned int32 __fast_idiv(unsigned int32 numerator, + uniform unsigned int32 divisor) { uniform int64 method = __idiv_table_u32[divisor-2][0]; uniform int64 multiplier = __idiv_table_u32[divisor-2][1]; 
uniform int64 shift = __idiv_table_u32[divisor-2][2]; @@ -4910,7 +4910,7 @@ static unmasked inline unsigned int32 __fast_idiv(unsigned int32 numerator, } __declspec(safe) -static unmasked int32 __fast_idiv(int32 numerator, uniform int32 divisor) { +static unmasked inline int32 __fast_idiv(int32 numerator, uniform int32 divisor) { uniform int64 method = __idiv_table_s32[divisor-2][0]; uniform int64 multiplier = __idiv_table_s32[divisor-2][1]; uniform int64 shift = __idiv_table_s32[divisor-2][2]; @@ -4927,3 +4927,45 @@ static unmasked int32 __fast_idiv(int32 numerator, uniform int32 divisor) { } } +/////////////////////////////////////////////////////////////////////////// +// Saturating int8/int16 ops + +__declspec(safe) +static unmasked inline unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) { + return __avg_up_uint8(a, b); +} + +__declspec(safe) +static unmasked inline int8 avg_up(int8 a, int8 b) { + return __avg_up_int8(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) { + return __avg_up_uint16(a, b); +} + +__declspec(safe) +static unmasked inline int16 avg_up(int16 a, int16 b) { + return __avg_up_int16(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) { + return __avg_down_uint8(a, b); +} + +__declspec(safe) +static unmasked inline int8 avg_down(int8 a, int8 b) { + return __avg_down_int8(a, b); +} + +__declspec(safe) +static unmasked inline unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) { + return __avg_down_uint16(a, b); +} + +__declspec(safe) +static unmasked inline int16 avg_down(int16 a, int16 b) { + return __avg_down_int16(a, b); +} diff --git a/tests/avg-down-int16.ispc b/tests/avg-down-int16.ispc new file mode 100644 index 00000000..10a3c2a2 --- /dev/null +++ b/tests/avg-down-int16.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + int16 a = aFOO[programIndex]; + int16 b = bf; + RET[programIndex] = avg_down(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5) / 2; +} diff --git a/tests/avg-down-int8.ispc b/tests/avg-down-int8.ispc new file mode 100644 index 00000000..67638934 --- /dev/null +++ b/tests/avg-down-int8.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + int8 a = aFOO[programIndex]; + int8 b = bf; + RET[programIndex] = avg_down(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5) / 2; +} diff --git a/tests/avg-down-uint16.ispc b/tests/avg-down-uint16.ispc new file mode 100644 index 00000000..70f9185e --- /dev/null +++ b/tests/avg-down-uint16.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + unsigned int16 a = aFOO[programIndex]; + unsigned int16 b = bf; + RET[programIndex] = avg_down(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5) / 2; +} diff --git a/tests/avg-down-uint8.ispc b/tests/avg-down-uint8.ispc new file mode 100644 index 00000000..75fbf116 --- /dev/null +++ b/tests/avg-down-uint8.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform 
float bf) { + unsigned int8 a = aFOO[programIndex]; + unsigned int8 b = bf; + RET[programIndex] = avg_down(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5) / 2; +} diff --git a/tests/avg-up-int16.ispc b/tests/avg-up-int16.ispc new file mode 100644 index 00000000..8f557a5b --- /dev/null +++ b/tests/avg-up-int16.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + int16 a = aFOO[programIndex]; + int16 b = bf; + RET[programIndex] = avg_up(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5 + 1) / 2; +} diff --git a/tests/avg-up-int8.ispc b/tests/avg-up-int8.ispc new file mode 100644 index 00000000..d0a3b444 --- /dev/null +++ b/tests/avg-up-int8.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + int8 a = aFOO[programIndex]; + int8 b = bf; + RET[programIndex] = avg_up(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5 + 1) / 2; +} diff --git a/tests/avg-up-uint16.ispc b/tests/avg-up-uint16.ispc new file mode 100644 index 00000000..273f9f3b --- /dev/null +++ b/tests/avg-up-uint16.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + unsigned int16 a = aFOO[programIndex]; + unsigned int16 b = bf; + RET[programIndex] = avg_up(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5 + 1) / 2; +} diff --git a/tests/avg-up-uint8.ispc b/tests/avg-up-uint8.ispc new file mode 100644 index 00000000..d5d02491 --- /dev/null +++ b/tests/avg-up-uint8.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bf) { + unsigned int8 a = aFOO[programIndex]; + unsigned int8 b = bf; + RET[programIndex] = avg_up(a, b); +} + +export void result(uniform float RET[]) { + RET[programIndex] = ((int)programIndex + 1 + 5 + 1) / 2; +}
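
As a usage illustration, here is a minimal sketch (not part of this patch) that exercises the new ``avg_up()`` and ``avg_down()`` stdlib routines documented above; the kernel name ``average_rows`` and its parameters are hypothetical. Because the routines widen internally (or map to PAVG/VRHADD-class instructions), they return the correctly rounded average without overflowing an 8-bit intermediate sum, which a naive ``(a + b) / 2`` evaluated in 8 bits could hit.

::

    // Hypothetical example kernel: average two rows of 8-bit pixels using the
    // avg_up()/avg_down() routines added to stdlib.ispc by this patch.
    export void average_rows(uniform unsigned int8 row0[],
                             uniform unsigned int8 row1[],
                             uniform unsigned int8 result[],
                             uniform int count) {
        foreach (i = 0 ... count) {
            unsigned int8 a = row0[i];
            unsigned int8 b = row1[i];
            // avg_up() rounds halfway cases up: (a + b + 1) / 2.
            // avg_down(a, b) would round down instead: (a + b) / 2.
            result[i] = avg_up(a, b);
        }
    }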