Add avg_{up,down}_int{8,16} routines to stdlib
These compute the average of two given values, rounding up and down, respectively, if the result isn't exact. When possible, these are mapped to target-specific intrinsics (PADD[BW] on IA and VH[R]ADD[US] on NEON.) A subsequent commit will add pattern-matching to generate calls to these intrinsincs when the corresponding patterns are detected in the IR.)
This commit is contained in:
@@ -277,3 +277,9 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli
|
||||
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
|
||||
ret double %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16 builtins
|
||||
|
||||
define_avgs()
|
||||
|
||||
|
||||
@@ -864,3 +864,9 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone
|
||||
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
|
||||
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
|
||||
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16 builtins
|
||||
|
||||
define_avgs()
|
||||
|
||||
|
||||
@@ -364,3 +364,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind
|
||||
declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind
|
||||
declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16 builtins
|
||||
|
||||
define_avgs()
|
||||
|
||||
|
||||
@@ -456,3 +456,62 @@ define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone {
|
||||
define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {
|
||||
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16
|
||||
|
||||
declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
|
||||
|
||||
define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone {
|
||||
%r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
|
||||
ret <8 x i8> %r
|
||||
}
|
||||
|
||||
declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
|
||||
|
||||
define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone {
|
||||
%r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
|
||||
ret <8 x i8> %r
|
||||
}
|
||||
|
||||
declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
|
||||
|
||||
define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone {
|
||||
%r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1)
|
||||
ret <8 x i8> %r
|
||||
}
|
||||
|
||||
declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
|
||||
|
||||
define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone {
|
||||
%r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1)
|
||||
ret <8 x i8> %r
|
||||
}
|
||||
|
||||
declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone {
|
||||
%r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
|
||||
ret <8 x i16> %r
|
||||
}
|
||||
|
||||
declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone {
|
||||
%r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
|
||||
ret <8 x i16> %r
|
||||
}
|
||||
|
||||
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone {
|
||||
%r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1)
|
||||
ret <8 x i16> %r
|
||||
}
|
||||
|
||||
declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone {
|
||||
%r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1)
|
||||
ret <8 x i16> %r
|
||||
}
|
||||
|
||||
@@ -426,3 +426,62 @@ define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
|
||||
define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
|
||||
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16
|
||||
|
||||
declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
|
||||
|
||||
define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone {
|
||||
%r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
|
||||
ret <4 x i8> %r
|
||||
}
|
||||
|
||||
declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
|
||||
|
||||
define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone {
|
||||
%r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
|
||||
ret <4 x i8> %r
|
||||
}
|
||||
|
||||
declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
|
||||
|
||||
define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone {
|
||||
%r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1)
|
||||
ret <4 x i8> %r
|
||||
}
|
||||
|
||||
declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone
|
||||
|
||||
define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone {
|
||||
%r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1)
|
||||
ret <4 x i8> %r
|
||||
}
|
||||
|
||||
declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
|
||||
|
||||
define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone {
|
||||
%r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
|
||||
ret <4 x i16> %r
|
||||
}
|
||||
|
||||
declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
|
||||
|
||||
define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone {
|
||||
%r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
|
||||
ret <4 x i16> %r
|
||||
}
|
||||
|
||||
declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
|
||||
|
||||
define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone {
|
||||
%r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1)
|
||||
ret <4 x i16> %r
|
||||
}
|
||||
|
||||
declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
|
||||
|
||||
define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone {
|
||||
%r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1)
|
||||
ret <4 x i16> %r
|
||||
}
|
||||
|
||||
@@ -506,3 +506,78 @@ define i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone {
|
||||
define i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone {
|
||||
reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64)
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16 builtins
|
||||
|
||||
declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
|
||||
define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
|
||||
%r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
|
||||
ret <16 x i8> %r
|
||||
}
|
||||
|
||||
declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
|
||||
define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone {
|
||||
%r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
|
||||
ret <16 x i8> %r
|
||||
}
|
||||
|
||||
declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
|
||||
define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
|
||||
%r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1)
|
||||
ret <16 x i8> %r
|
||||
}
|
||||
|
||||
declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
|
||||
define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone {
|
||||
%r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1)
|
||||
ret <16 x i8> %r
|
||||
}
|
||||
|
||||
declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
|
||||
v16tov8(i16, %0, %a0, %b0)
|
||||
v16tov8(i16, %1, %a1, %b1)
|
||||
%r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
|
||||
%r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
|
||||
v8tov16(i16, %r0, %r1, %r)
|
||||
ret <16 x i16> %r
|
||||
}
|
||||
|
||||
declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone {
|
||||
v16tov8(i16, %0, %a0, %b0)
|
||||
v16tov8(i16, %1, %a1, %b1)
|
||||
%r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
|
||||
%r1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1)
|
||||
v8tov16(i16, %r0, %r1, %r)
|
||||
ret <16 x i16> %r
|
||||
}
|
||||
|
||||
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
|
||||
v16tov8(i16, %0, %a0, %b0)
|
||||
v16tov8(i16, %1, %a1, %b1)
|
||||
%r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
|
||||
%r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
|
||||
v8tov16(i16, %r0, %r1, %r)
|
||||
ret <16 x i16> %r
|
||||
}
|
||||
|
||||
declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone {
|
||||
v16tov8(i16, %0, %a0, %b0)
|
||||
v16tov8(i16, %1, %a1, %b1)
|
||||
%r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1)
|
||||
%r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1)
|
||||
v8tov16(i16, %r0, %r1, %r)
|
||||
ret <16 x i16> %r
|
||||
}
|
||||
|
||||
@@ -269,4 +269,8 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
|
||||
ret i64 %val
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16 builtins
|
||||
|
||||
define_avgs()
|
||||
|
||||
|
||||
@@ -449,3 +449,34 @@ gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16 builtins
|
||||
|
||||
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
|
||||
define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) {
|
||||
%v0 = shufflevector <8 x i8> %0, <8 x i8> undef,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%v1 = shufflevector <8 x i8> %1, <8 x i8> undef,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef,
|
||||
i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
%r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1)
|
||||
%r = shufflevector <16 x i8> %r16, <16 x i8> undef,
|
||||
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
||||
ret <8 x i8> %r
|
||||
}
|
||||
|
||||
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) {
|
||||
%r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1)
|
||||
ret <8 x i16> %r
|
||||
}
|
||||
|
||||
define_avg_up_int8()
|
||||
define_avg_up_int16()
|
||||
define_down_avgs()
|
||||
|
||||
@@ -456,3 +456,28 @@ gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16 builtins
|
||||
|
||||
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
|
||||
define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone {
|
||||
%r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1)
|
||||
ret <16 x i8> %r
|
||||
}
|
||||
|
||||
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
|
||||
|
||||
define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone {
|
||||
v16tov8(i16, %0, %a0, %b0)
|
||||
v16tov8(i16, %1, %a1, %b1)
|
||||
%r0 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
|
||||
%r1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1)
|
||||
v8tov16(i16, %r0, %r1, %r)
|
||||
ret <16 x i16> %r
|
||||
}
|
||||
|
||||
define_avg_up_int8()
|
||||
define_avg_up_int16()
|
||||
define_down_avgs()
|
||||
|
||||
@@ -573,3 +573,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
|
||||
binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
|
||||
ret <8 x double> %ret
|
||||
}
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16 builtins
|
||||
|
||||
define_avgs()
|
||||
|
||||
|
||||
@@ -473,3 +473,9 @@ gen_scatter(i32)
|
||||
gen_scatter(float)
|
||||
gen_scatter(i64)
|
||||
gen_scatter(double)
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16 builtins
|
||||
|
||||
define_avgs()
|
||||
|
||||
|
||||
128
builtins/util.m4
128
builtins/util.m4
@@ -49,9 +49,9 @@ define(`MASK_HIGH_BIT_ON',
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; vector assembly and deconstruction utilities
|
||||
;; vector deconstruction utilities
|
||||
;; split 8-wide vector into 2 4-wide vectors
|
||||
;;
|
||||
;;
|
||||
;; $1: vector element type
|
||||
;; $2: 8-wide vector
|
||||
;; $3: first 4-wide vector
|
||||
@@ -71,10 +71,6 @@ define(`v16tov8', `
|
||||
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
')
|
||||
|
||||
;; 4-wide into 2 2-wide
|
||||
;; args as above
|
||||
;;
|
||||
|
||||
define(`v4tov2', `
|
||||
$3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> <i32 0, i32 1>
|
||||
$4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> <i32 2, i32 3>
|
||||
@@ -96,6 +92,20 @@ define(`v16tov4', `
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; vector assembly: wider vector from two narrower vectors
|
||||
;;
|
||||
;; $1: vector element type
|
||||
;; $2: first n-wide vector
|
||||
;; $3: second n-wide vector
|
||||
;; $4: result 2*n-wide vector
|
||||
define(`v8tov16', `
|
||||
$4 = shufflevector <8 x $1> $2, <8 x $1> $3,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
|
||||
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
;; Helper macro for calling various SSE instructions for scalar values
|
||||
;; but where the instruction takes a vector parameter.
|
||||
;; $1 : name of variable to put the final value in
|
||||
@@ -4276,3 +4286,109 @@ define i1 @__rdrand_i64(i64 * %ptr) {
|
||||
ret i1 %good
|
||||
}
|
||||
')
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; int8/int16 builtins
|
||||
|
||||
define(`define_avg_up_uint8', `
|
||||
define <WIDTH x i8> @__avg_up_uint8(<WIDTH x i8>, <WIDTH x i8>) {
|
||||
%a16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
|
||||
%b16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
|
||||
%sum1 = add <WIDTH x i16> %a16, %b16
|
||||
%sum = add <WIDTH x i16> %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
|
||||
%avg = lshr <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
|
||||
%r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
|
||||
ret <WIDTH x i8> %r
|
||||
}')
|
||||
|
||||
define(`define_avg_up_int8', `
|
||||
define <WIDTH x i8> @__avg_up_int8(<WIDTH x i8>, <WIDTH x i8>) {
|
||||
%a16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
|
||||
%b16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
|
||||
%sum1 = add <WIDTH x i16> %a16, %b16
|
||||
%sum = add <WIDTH x i16> %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
|
||||
%avg = sdiv <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 >
|
||||
%r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
|
||||
ret <WIDTH x i8> %r
|
||||
}')
|
||||
|
||||
define(`define_avg_up_uint16', `
|
||||
define <WIDTH x i16> @__avg_up_uint16(<WIDTH x i16>, <WIDTH x i16>) {
|
||||
%a32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
|
||||
%b32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
|
||||
%sum1 = add <WIDTH x i32> %a32, %b32
|
||||
%sum = add <WIDTH x i32> %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
|
||||
%avg = lshr <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
|
||||
%r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
|
||||
ret <WIDTH x i16> %r
|
||||
}')
|
||||
|
||||
define(`define_avg_up_int16', `
|
||||
define <WIDTH x i16> @__avg_up_int16(<WIDTH x i16>, <WIDTH x i16>) {
|
||||
%a32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
|
||||
%b32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
|
||||
%sum1 = add <WIDTH x i32> %a32, %b32
|
||||
%sum = add <WIDTH x i32> %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
|
||||
%avg = sdiv <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 >
|
||||
%r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
|
||||
ret <WIDTH x i16> %r
|
||||
}')
|
||||
|
||||
define(`define_avg_down_uint8', `
|
||||
define <WIDTH x i8> @__avg_down_uint8(<WIDTH x i8>, <WIDTH x i8>) {
|
||||
%a16 = zext <WIDTH x i8> %0 to <WIDTH x i16>
|
||||
%b16 = zext <WIDTH x i8> %1 to <WIDTH x i16>
|
||||
%sum = add <WIDTH x i16> %a16, %b16
|
||||
%avg = lshr <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 >
|
||||
%r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
|
||||
ret <WIDTH x i8> %r
|
||||
}')
|
||||
|
||||
define(`define_avg_down_int8', `
|
||||
define <WIDTH x i8> @__avg_down_int8(<WIDTH x i8>, <WIDTH x i8>) {
|
||||
%a16 = sext <WIDTH x i8> %0 to <WIDTH x i16>
|
||||
%b16 = sext <WIDTH x i8> %1 to <WIDTH x i16>
|
||||
%sum = add <WIDTH x i16> %a16, %b16
|
||||
%avg = sdiv <WIDTH x i16> %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 >
|
||||
%r = trunc <WIDTH x i16> %avg to <WIDTH x i8>
|
||||
ret <WIDTH x i8> %r
|
||||
}')
|
||||
|
||||
define(`define_avg_down_uint16', `
|
||||
define <WIDTH x i16> @__avg_down_uint16(<WIDTH x i16>, <WIDTH x i16>) {
|
||||
%a32 = zext <WIDTH x i16> %0 to <WIDTH x i32>
|
||||
%b32 = zext <WIDTH x i16> %1 to <WIDTH x i32>
|
||||
%sum = add <WIDTH x i32> %a32, %b32
|
||||
%avg = lshr <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 >
|
||||
%r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
|
||||
ret <WIDTH x i16> %r
|
||||
}')
|
||||
|
||||
define(`define_avg_down_int16', `
|
||||
define <WIDTH x i16> @__avg_down_int16(<WIDTH x i16>, <WIDTH x i16>) {
|
||||
%a32 = sext <WIDTH x i16> %0 to <WIDTH x i32>
|
||||
%b32 = sext <WIDTH x i16> %1 to <WIDTH x i32>
|
||||
%sum = add <WIDTH x i32> %a32, %b32
|
||||
%avg = sdiv <WIDTH x i32> %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 >
|
||||
%r = trunc <WIDTH x i32> %avg to <WIDTH x i16>
|
||||
ret <WIDTH x i16> %r
|
||||
}')
|
||||
|
||||
define(`define_up_avgs', `
|
||||
define_avg_up_uint8()
|
||||
define_avg_up_int8()
|
||||
define_avg_up_uint16()
|
||||
define_avg_up_int16()
|
||||
')
|
||||
|
||||
define(`define_down_avgs', `
|
||||
define_avg_down_uint8()
|
||||
define_avg_down_int8()
|
||||
define_avg_down_uint16()
|
||||
define_avg_down_int16()
|
||||
')
|
||||
|
||||
define(`define_avgs', `
|
||||
define_up_avgs()
|
||||
define_down_avgs()
|
||||
')
|
||||
|
||||
Reference in New Issue
Block a user