Add reduce_add() for int8 and int16 types.
This maps to specialized instructions (e.g. PSADBW) when available.
This commit is contained in:
@@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
|
||||
|
||||
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
|
||||
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
|
||||
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
|
||||
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
|
||||
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
|
||||
<16 x i8> zeroinitializer)
|
||||
%r0 = extractelement <2 x i64> %rv, i32 0
|
||||
%r1 = extractelement <2 x i64> %rv, i32 1
|
||||
%r = add i64 %r0, %r1
|
||||
%r16 = trunc i64 %r to i16
|
||||
ret i16 %r16
|
||||
}
|
||||
|
||||
define internal <4 x i16> @__add_varying_i16(<4 x i16>,
|
||||
<4 x i16>) nounwind readnone alwaysinline {
|
||||
%r = add <4 x i16> %0, %1
|
||||
ret <4 x i16> %r
|
||||
}
|
||||
|
||||
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
|
||||
%r = add i16 %0, %1
|
||||
ret i16 %r
|
||||
}
|
||||
|
||||
define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
|
||||
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
|
||||
}
|
||||
|
||||
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
|
||||
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
|
||||
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||
|
||||
Reference in New Issue
Block a user