Add reduce_add() for int8 and int16 types.

This maps to specialized instructions (e.g. PSADBW) when available.
This commit is contained in:
Matt Pharr
2013-07-25 09:11:39 -07:00
parent 2d063925a1
commit b6df447b55
24 changed files with 464 additions and 44 deletions

View File

@@ -1511,6 +1511,22 @@ static FORCEINLINE int64_t __count_trailing_zeros_i64(const __vec1_i64 mask) {
// reductions
///////////////////////////////////////////////////////////////////////////
static FORCEINLINE int16_t __reduce_add_i8(__vec16_i8 v) {
    // Scalar fallback: visit the 16 lanes of v.v in ascending order and
    // add each one into an int16_t running total, which is returned.
    // TODO: improve this!
    int16_t sum = 0;
    int lane = 0;
    while (lane < 16) {
        sum += v.v[lane];
        ++lane;
    }
    return sum;
}
static FORCEINLINE int32_t __reduce_add_i16(__vec16_i16 v) {
    // Scalar fallback: visit the 16 lanes of v.v in ascending order and
    // add each one into an int32_t running total, which is returned.
    // TODO: improve this!
    int32_t sum = 0;
    int lane = 0;
    while (lane < 16) {
        sum += v.v[lane];
        ++lane;
    }
    return sum;
}
static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) {
    // Hand the whole vector to the _mm512_reduce_add_epi32 intrinsic; its
    // result is converted to the uint32_t return type of this function.
    const uint32_t total = _mm512_reduce_add_epi32(v);
    return total;
}