Add avg_{up,down}_int{8,16} routines to stdlib

These compute the average of two given values, rounding up and down,
respectively, if the result isn't exact.  When possible, these are
mapped to target-specific intrinsics (PAVG[BW] on IA and VH[R]ADD[US]
on NEON).
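
For reference, the rounding behavior is equivalent to the following scalar
C sketch for the unsigned 8-bit case (illustrative only; the stdlib entry
points are the vector __avg_* builtins defined below):

#include <stdint.h>

/* Average of two uint8 values, computed in a wider type so the sum cannot
   overflow.  The "up" variant adds 1 before the shift so inexact results
   round up; the "down" variant omits it so they round down. */
static inline uint8_t avg_up_uint8(uint8_t a, uint8_t b) {
    return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) >> 1);
}
static inline uint8_t avg_down_uint8(uint8_t a, uint8_t b) {
    return (uint8_t)(((uint16_t)a + (uint16_t)b) >> 1);
}
/* e.g. avg_up_uint8(3, 4) == 4 and avg_down_uint8(3, 4) == 3;
   both return 4 for (4, 4). */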

A subsequent commit will add pattern-matching to generate calls to
these intrinsics when the corresponding patterns are detected in the
IR.
Matt Pharr
2013-08-03 20:44:25 -07:00
parent 4f48d3258a
commit 5b20b06bd9
23 changed files with 592 additions and 15 deletions

View File

@@ -277,3 +277,9 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()

View File

@@ -864,3 +864,9 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()

View File

@@ -364,3 +364,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()

View File

@@ -456,3 +456,62 @@ define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone {
define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16
declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}
declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}
declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}
declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone {
%r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
ret <8 x i8> %r
}
declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone {
%r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}

View File

@@ -426,3 +426,62 @@ define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16
declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}
declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}
declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}
declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone {
%r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
ret <4 x i8> %r
}
declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}
declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}
declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}
declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone {
%r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
ret <4 x i16> %r
}

View File

@@ -506,3 +506,78 @@ define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone {
define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
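
The 16-wide 16-bit variants above split each operand into two 8-element
halves, run the 8-wide NEON intrinsic on each half, and concatenate the
results.  A minimal sketch of the same split-and-recombine idea using C NEON
intrinsics, for the unsigned rounding-up case (the function name and array
layout are assumptions for illustration, not part of this commit):

#include <arm_neon.h>
#include <stdint.h>

/* Rounding-up average of 16 uint16 lanes, processed as two 8-wide halves.
   VRHADD.U16 computes (a + b + 1) >> 1 per lane.  Requires a NEON target. */
void avg_up_uint16_x16(const uint16_t a[16], const uint16_t b[16], uint16_t out[16]) {
    uint16x8_t a_lo = vld1q_u16(a),     a_hi = vld1q_u16(a + 8);
    uint16x8_t b_lo = vld1q_u16(b),     b_hi = vld1q_u16(b + 8);
    vst1q_u16(out,     vrhaddq_u16(a_lo, b_lo));
    vst1q_u16(out + 8, vrhaddq_u16(a_hi, b_hi));
}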

View File

@@ -269,4 +269,8 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
ret i64 %val
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()

View File

@@ -449,3 +449,34 @@ gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) {
%v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1)
%r = shufflevector <16 x i8> %r16, <16 x i8> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) {
%r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1)
ret <8 x i16> %r
}
define_avg_up_int8()
define_avg_up_int16()
define_down_avgs()

View File

@@ -456,3 +456,28 @@ gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
%r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1)
ret <16 x i8> %r
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
v16tov8(i16, %0, %a0, %b0)
v16tov8(i16, %1, %a1, %b1)
%r0 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
%r1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1)
v8tov16(i16, %r0, %r1, %r)
ret <16 x i16> %r
}
define_avg_up_int8()
define_avg_up_int16()
define_down_avgs()

View File

@@ -573,3 +573,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <8 x double> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()

View File

@@ -473,3 +473,9 @@ gen_scatter(i32)
gen_scatter(float)
gen_scatter(i64)
gen_scatter(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define_avgs()

View File

@@ -49,9 +49,9 @@ define(`MASK_HIGH_BIT_ON',
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; vector assembly and deconstruction utilities
;; vector deconstruction utilities
;; split 8-wide vector into 2 4-wide vectors
;;
;;
;; $1: vector element type
;; $2: 8-wide vector
;; $3: first 4-wide vector
@@ -71,10 +71,6 @@ define(`v16tov8', `
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
;; 4-wide into 2 2-wide
;; args as above
;;
define(`v4tov2', `
$3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> <i32 0, i32 1>
$4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> <i32 2, i32 3>
@@ -96,6 +92,20 @@ define(`v16tov4', `
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; vector assembly: wider vector from two narrower vectors
;;
;; $1: vector element type
;; $2: first n-wide vector
;; $3: second n-wide vector
;; $4: result 2*n-wide vector
define(`v8tov16', `
$4 = shufflevector <8 x $1> $2, <8 x $1> $3,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Helper macro for calling various SSE instructions for scalar values
;; but where the instruction takes a vector parameter.
;; $1 : name of variable to put the final value in
@@ -4276,3 +4286,109 @@ define i1 @__rdrand_i64(i64 * %ptr) {
ret i1 %good
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins
define(`define_avg_up_uint8', `
define <WIDTH x i8> @__avg_up_uint8(<WIDTH x i8>, <WIDTH x i8>) {
%a16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
%b16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
%sum1 = add <WIDTH x i16> %a16, %b16
%sum = add <WIDTH x i16> %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
%avg = lshr <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
%r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
ret <WIDTH x i8> %r
}')
define(`define_avg_up_int8', `
define <WIDTH x i8> @__avg_up_int8(<WIDTH x i8>, <WIDTH x i8>) {
%a16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
%b16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
%sum1 = add <WIDTH x i16> %a16, %b16
%sum = add <WIDTH x i16> %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
%avg = sdiv <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 >
%r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
ret <WIDTH x i8> %r
}')
define(`define_avg_up_uint16', `
define <WIDTH x i16> @__avg_up_uint16(<WIDTH x i16>, <WIDTH x i16>) {
%a32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
%b32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
%sum1 = add <WIDTH x i32> %a32, %b32
%sum = add <WIDTH x i32> %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
%avg = lshr <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
%r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
ret <WIDTH x i16> %r
}')
define(`define_avg_up_int16', `
define <WIDTH x i16> @__avg_up_int16(<WIDTH x i16>, <WIDTH x i16>) {
%a32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
%b32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
%sum1 = add <WIDTH x i32> %a32, %b32
%sum = add <WIDTH x i32> %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
%avg = sdiv <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 >
%r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
ret <WIDTH x i16> %r
}')
define(`define_avg_down_uint8', `
define <WIDTH x i8> @__avg_down_uint8(<WIDTH x i8>, <WIDTH x i8>) {
%a16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
%b16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
%sum = add <WIDTH x i16> %a16, %b16
%avg = lshr <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
%r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
ret <WIDTH x i8> %r
}')
define(`define_avg_down_int8', `
define <WIDTH x i8> @__avg_down_int8(<WIDTH x i8>, <WIDTH x i8>) {
%a16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
%b16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
%sum = add <WIDTH x i16> %a16, %b16
%avg = sdiv <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 >
%r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
ret <WIDTH x i8> %r
}')
define(`define_avg_down_uint16', `
define <WIDTH x i16> @__avg_down_uint16(<WIDTH x i16>, <WIDTH x i16>) {
%a32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
%b32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
%sum = add <WIDTH x i32> %a32, %b32
%avg = lshr <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
%r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
ret <WIDTH x i16> %r
}')
define(`define_avg_down_int16', `
define <WIDTH x i16> @__avg_down_int16(<WIDTH x i16>, <WIDTH x i16>) {
%a32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
%b32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
%sum = add <WIDTH x i32> %a32, %b32
%avg = sdiv <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 >
%r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
ret <WIDTH x i16> %r
}')
define(`define_up_avgs', `
define_avg_up_uint8()
define_avg_up_int8()
define_avg_up_uint16()
define_avg_up_int16()
')
define(`define_down_avgs', `
define_avg_down_uint8()
define_avg_down_int8()
define_avg_down_uint16()
define_avg_down_int16()
')
define(`define_avgs', `
define_up_avgs()
define_down_avgs()
')
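
Expanded for a given WIDTH, the signed helpers above divide the widened sum
with sdiv rather than shifting.  In scalar terms they behave like the
following C mirror of the int16 macros (illustrative only, not part of this
commit):

#include <stdint.h>

/* Scalar mirror of define_avg_up_int16 / define_avg_down_int16: widen to
   32 bits, add (plus 1 for the "up" variant), then divide by 2.  C's signed
   division truncates toward zero, matching the sdiv used above. */
static int16_t avg_up_int16(int16_t a, int16_t b) {
    return (int16_t)(((int32_t)a + (int32_t)b + 1) / 2);
}
static int16_t avg_down_int16(int16_t a, int16_t b) {
    return (int16_t)(((int32_t)a + (int32_t)b) / 2);
}
/* e.g. avg_up_int16(-3, -4) == -3 (exact average -3.5),
        avg_down_int16(7, 4) ==  5 (exact average  5.5). */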