Add reduce_add() for int8 and int16 types.
This maps to specialized instructions (e.g. PSADBW) when available.
This commit is contained in:
@@ -309,6 +309,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
|
||||
ret i1 %cmp
|
||||
}
|
||||
|
||||
;; SSE2 PSADBW intrinsic: for each 8-byte half of its operands it sums the
;; absolute differences of the unsigned bytes and returns each sum in an
;; i64 lane.
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

;; Horizontal add of the eight i8 lanes of the argument, returned as an i16.
;; The sum is formed by PSADBW against an all-zero vector, so each byte
;; contributes its unsigned value.
;; NOTE(review): callers that pass signed int8 data should confirm that the
;; unsigned interpretation is what they expect.
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
  ;; Pad the input to 16 bytes: lanes 0-7 come from the argument, lanes 8-15
  ;; all use shuffle index 8, which selects element 0 of the zero vector.
  %bytes16 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                  i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %halfsums = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %bytes16,
                                                    <16 x i8> zeroinitializer)
  ;; One partial sum per 8-byte half; combine both and narrow to the
  ;; return type.
  %total_lo = extractelement <2 x i64> %halfsums, i32 0
  %total_hi = extractelement <2 x i64> %halfsums, i32 1
  %total64 = add i64 %total_lo, %total_hi
  %total16 = trunc i64 %total64 to i16
  ret i16 %total16
}
|
||||
|
||||
;; Lane-wise addition of two <8 x i16> vectors. Uses a plain LLVM add with
;; no nsw/nuw flags, so each lane wraps on overflow.
;; Referenced by name in the body of @__reduce_add_int16.
define internal <8 x i16> @__add_varying_i16(<8 x i16>, <8 x i16>) nounwind readnone alwaysinline {
  %lanes_added = add <8 x i16> %0, %1
  ret <8 x i16> %lanes_added
}
|
||||
|
||||
;; Scalar addition of two i16 values. Uses a plain LLVM add with no nsw/nuw
;; flags, so the result wraps on overflow.
;; Referenced by name in the body of @__reduce_add_int16.
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
  %scalars_added = add i16 %0, %1
  ret i16 %scalars_added
}
|
||||
|
||||
;; Takes an <8 x i16> vector and returns a single i16 -- presumably the sum
;; of the eight lanes (inferred from the name and the add helpers passed
;; below; confirm against the macro definition).
;; The body is a macro invocation rather than LLVM IR; the macro is defined
;; outside this view. It is given the element type, @__add_varying_i16
;; (an <8 x i16> lane-wise add) and @__add_uniform_i16 (a scalar i16 add).
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
|
||||
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
|
||||
}
|
||||
|
||||
;; Takes an <8 x float> vector and returns a single float -- presumably the
;; minimum of the eight lanes (inferred from the name and the operations
;; passed below; confirm against the macro definition).
;; The body is a macro invocation rather than LLVM IR; the macro is defined
;; outside this view. It is given the element type, the SSE MINPS intrinsic
;; @llvm.x86.sse.min.ps, and @__min_uniform_float, which is also defined
;; outside this view (presumably a scalar float minimum -- verify).
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
|
||||
reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user