For << and >> with C++, detect when all instances are shifting by the same amount.

In this case, we now emit calls to potentially-specialized functions for the
left/right shifts that take a single integer value for the shift amount.  These
in turn can be matched to the corresponding intrinsics for the SSE target.

Issue #145.
Author: Matt Pharr
Date:   2012-01-19 10:04:32 -07:00
Parent: 3f89295d10
Commit: 68f6ea8def
6 changed files with 433 additions and 280 deletions
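In outline, this is the shape of the change (a minimal sketch only; the type and function names below are simplified placeholders, not the exact identifiers in the diffs that follow): each C++ target header gains a second shift overload that takes a scalar shift count, so a single SSE intrinsic can cover all lanes.

#include <emmintrin.h>   // SSE2: _mm_set_epi32, _mm_sll_epi32

struct vec4_i32 { __m128i v; };   // simplified stand-in for __vec4_i32

// Existing form: per-lane shift amounts. SSE2 has no single instruction
// for this, so each lane must be extracted and shifted individually:
//   vec4_i32 shl(vec4_i32 a, vec4_i32 b);

// New form: one uniform shift amount for all lanes, so one
// _mm_sll_epi32 shifts the whole vector at once.
static inline vec4_i32 shl(vec4_i32 a, int32_t b) {
    vec4_i32 r;
    r.v = _mm_sll_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
    return r;
}

In the diffs below, the generic 16-wide implementation (first file) adds these overloads as scalar loops via the SHIFT_UNIFORM macro, while the SSE4 implementation (second file) maps them to _mm_sll/_mm_srl/_mm_sra intrinsics where one exists and falls back to per-lane extracts for the 8-bit shifts and the 64-bit arithmetic right shift.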

@@ -251,6 +251,14 @@ static FORCEINLINE TYPE __select(bool cond, TYPE a, TYPE b) { \
return cond ? a : b; \
}
#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \
static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
TYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = (CAST)(a.v[i]) OP b; \
return ret; \
}
#define SMEAR(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
VTYPE ret; \
@@ -386,6 +394,10 @@ BINARY_OP_CAST(__vec16_i8, int8_t, __srem, %)
BINARY_OP_CAST(__vec16_i8, uint8_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i8, int8_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i8, uint8_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i8, int8_t, __shl, <<)
CMP_OP(__vec16_i8, int8_t, __equal, ==)
CMP_OP(__vec16_i8, int8_t, __not_equal, !=)
CMP_OP(__vec16_i8, uint8_t, __unsigned_less_equal, <=)
@@ -425,6 +437,10 @@ BINARY_OP_CAST(__vec16_i16, int16_t, __srem, %)
BINARY_OP_CAST(__vec16_i16, uint16_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i16, int16_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i16, uint16_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i16, int16_t, __shl, <<)
CMP_OP(__vec16_i16, int16_t, __equal, ==)
CMP_OP(__vec16_i16, int16_t, __not_equal, !=)
CMP_OP(__vec16_i16, uint16_t, __unsigned_less_equal, <=)
@@ -464,6 +480,10 @@ BINARY_OP_CAST(__vec16_i32, int32_t, __srem, %)
BINARY_OP_CAST(__vec16_i32, uint32_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i32, int32_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i32, int32_t, __shl, <<)
CMP_OP(__vec16_i32, int32_t, __equal, ==)
CMP_OP(__vec16_i32, int32_t, __not_equal, !=)
CMP_OP(__vec16_i32, uint32_t, __unsigned_less_equal, <=)
@@ -503,6 +523,10 @@ BINARY_OP_CAST(__vec16_i64, int64_t, __srem, %)
BINARY_OP_CAST(__vec16_i64, uint64_t, __lshr, >>)
BINARY_OP_CAST(__vec16_i64, int64_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)
CMP_OP(__vec16_i64, int64_t, __equal, ==)
CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
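For reference, here is roughly what one SHIFT_UNIFORM instantiation above expands to (assuming, as the macro implies, that the 16-wide vector types store their lanes in an array member named v):

// Rough expansion of SHIFT_UNIFORM(__vec16_i32, uint32_t, __lshr, >>)
static FORCEINLINE __vec16_i32 __lshr(__vec16_i32 a, int32_t b) {
    __vec16_i32 ret;
    for (int i = 0; i < 16; ++i)
        ret.v[i] = (uint32_t)(a.v[i]) >> b;   // cast to unsigned for a logical shift
    return ret;
}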

@@ -303,6 +303,13 @@ static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, __vec4_i8 b) {
_mm_extract_epi8(a.v, 3) << _mm_extract_epi8(b.v, 3));
}
static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, int32_t b) {
return __vec4_i8(_mm_extract_epi8(a.v, 0) << b,
_mm_extract_epi8(a.v, 1) << b,
_mm_extract_epi8(a.v, 2) << b,
_mm_extract_epi8(a.v, 3) << b);
}
static FORCEINLINE __vec4_i8 __udiv(__vec4_i8 a, __vec4_i8 b) {
return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) /
(uint8_t)_mm_extract_epi8(b.v, 0),
@@ -358,6 +365,13 @@ static FORCEINLINE __vec4_i8 __lshr(__vec4_i8 a, __vec4_i8 b) {
(uint8_t)_mm_extract_epi8(b.v, 3));
}
static FORCEINLINE __vec4_i8 __lshr(__vec4_i8 a, int32_t b) {
return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) >> b,
(uint8_t)_mm_extract_epi8(a.v, 1) >> b,
(uint8_t)_mm_extract_epi8(a.v, 2) >> b,
(uint8_t)_mm_extract_epi8(a.v, 3) >> b);
}
static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, __vec4_i8 b) {
return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) >>
(int8_t)_mm_extract_epi8(b.v, 0),
@@ -369,6 +383,13 @@ static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, __vec4_i8 b) {
(int8_t)_mm_extract_epi8(b.v, 3));
}
static FORCEINLINE __vec4_i8 __ashr(__vec4_i8 a, int32_t b) {
return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) >> b,
(int8_t)_mm_extract_epi8(a.v, 1) >> b,
(int8_t)_mm_extract_epi8(a.v, 2) >> b,
(int8_t)_mm_extract_epi8(a.v, 3) >> b);
}
static FORCEINLINE __vec4_i1 __equal(__vec4_i8 a, __vec4_i8 b) {
__m128i cmp = _mm_cmpeq_epi8(a.v, b.v);
return __vec4_i1(_mm_extract_epi8(cmp, 0),
@@ -547,6 +568,10 @@ static FORCEINLINE __vec4_i16 __shl(__vec4_i16 a, __vec4_i16 b) {
_mm_extract_epi16(a.v, 3) << _mm_extract_epi16(b.v, 3));
}
static FORCEINLINE __vec4_i16 __shl(__vec4_i16 a, int32_t b) {
return _mm_sll_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
}
static FORCEINLINE __vec4_i16 __udiv(__vec4_i16 a, __vec4_i16 b) {
return __vec4_i16((uint16_t)_mm_extract_epi16(a.v, 0) /
(uint16_t)_mm_extract_epi16(b.v, 0),
@@ -602,6 +627,10 @@ static FORCEINLINE __vec4_i16 __lshr(__vec4_i16 a, __vec4_i16 b) {
(uint16_t)_mm_extract_epi16(b.v, 3));
}
static FORCEINLINE __vec4_i16 __lshr(__vec4_i16 a, int32_t b) {
return _mm_srl_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
}
static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, __vec4_i16 b) {
return __vec4_i16((int16_t)_mm_extract_epi16(a.v, 0) >>
(int16_t)_mm_extract_epi16(b.v, 0),
@@ -613,6 +642,10 @@ static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, __vec4_i16 b) {
(int16_t)_mm_extract_epi16(b.v, 3));
}
static FORCEINLINE __vec4_i16 __ashr(__vec4_i16 a, int32_t b) {
return _mm_sra_epi16(a.v, _mm_set_epi32(0, 0, 0, b));
}
static FORCEINLINE __vec4_i1 __equal(__vec4_i16 a, __vec4_i16 b) {
__m128i cmp = _mm_cmpeq_epi16(a.v, b.v);
return __vec4_i1(_mm_extract_epi16(cmp, 0),
@@ -789,9 +822,6 @@ static FORCEINLINE __vec4_i32 __xor(__vec4_i32 a, __vec4_i32 b) {
}
static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, __vec4_i32 b) {
// FIXME: if we can determine at compile time that b has the same value
// across all elements, then we can use _mm_sll_epi32.
/* fixme: llvm generates this code for shift left, which is presumably
more efficient than doing each component individually as below.
@@ -813,57 +843,92 @@ _f___ii: ## @f___ii
ret
*/
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) << _mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) << _mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) << _mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) << _mm_extract_epi32(b.v, 3));
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) <<
_mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) <<
_mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) <<
_mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) <<
_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, int32_t b) {
return _mm_sll_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
}
static FORCEINLINE __vec4_i32 __udiv(__vec4_i32 a, __vec4_i32 b) {
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) / (uint32_t)_mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) / (uint32_t)_mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) / (uint32_t)_mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) / (uint32_t)_mm_extract_epi32(b.v, 3));
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) /
(uint32_t)_mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) /
(uint32_t)_mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) /
(uint32_t)_mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) /
(uint32_t)_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __sdiv(__vec4_i32 a, __vec4_i32 b) {
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) / (int32_t)_mm_extract_epi32(b.v, 0),
(int32_t)_mm_extract_epi32(a.v, 1) / (int32_t)_mm_extract_epi32(b.v, 1),
(int32_t)_mm_extract_epi32(a.v, 2) / (int32_t)_mm_extract_epi32(b.v, 2),
(int32_t)_mm_extract_epi32(a.v, 3) / (int32_t)_mm_extract_epi32(b.v, 3));
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) /
(int32_t)_mm_extract_epi32(b.v, 0),
(int32_t)_mm_extract_epi32(a.v, 1) /
(int32_t)_mm_extract_epi32(b.v, 1),
(int32_t)_mm_extract_epi32(a.v, 2) /
(int32_t)_mm_extract_epi32(b.v, 2),
(int32_t)_mm_extract_epi32(a.v, 3) /
(int32_t)_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __urem(__vec4_i32 a, __vec4_i32 b) {
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) % (uint32_t)_mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) % (uint32_t)_mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) % (uint32_t)_mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) % (uint32_t)_mm_extract_epi32(b.v, 3));
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) %
(uint32_t)_mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) %
(uint32_t)_mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) %
(uint32_t)_mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) %
(uint32_t)_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __srem(__vec4_i32 a, __vec4_i32 b) {
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) % (int32_t)_mm_extract_epi32(b.v, 0),
(int32_t)_mm_extract_epi32(a.v, 1) % (int32_t)_mm_extract_epi32(b.v, 1),
(int32_t)_mm_extract_epi32(a.v, 2) % (int32_t)_mm_extract_epi32(b.v, 2),
(int32_t)_mm_extract_epi32(a.v, 3) % (int32_t)_mm_extract_epi32(b.v, 3));
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) %
(int32_t)_mm_extract_epi32(b.v, 0),
(int32_t)_mm_extract_epi32(a.v, 1) %
(int32_t)_mm_extract_epi32(b.v, 1),
(int32_t)_mm_extract_epi32(a.v, 2) %
(int32_t)_mm_extract_epi32(b.v, 2),
(int32_t)_mm_extract_epi32(a.v, 3) %
(int32_t)_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __lshr(__vec4_i32 a, __vec4_i32 b) {
// FIXME: if we can determine at compile time that b has the same value
// across all elements, e.g. using gcc's __builtin_constant_p, then we
// can use _mm_srl_epi32.
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) >> _mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) >> _mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) >> _mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) >> _mm_extract_epi32(b.v, 3));
return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) >>
_mm_extract_epi32(b.v, 0),
(uint32_t)_mm_extract_epi32(a.v, 1) >>
_mm_extract_epi32(b.v, 1),
(uint32_t)_mm_extract_epi32(a.v, 2) >>
_mm_extract_epi32(b.v, 2),
(uint32_t)_mm_extract_epi32(a.v, 3) >>
_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __lshr(__vec4_i32 a, int32_t b) {
return _mm_srl_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
}
static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, __vec4_i32 b) {
// FIXME: if we can determine at compile time that b has the same value
// across all elements, then we can use _mm_sra_epi32.
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) >> _mm_extract_epi32(b.v, 0),
(int32_t)_mm_extract_epi32(a.v, 1) >> _mm_extract_epi32(b.v, 1),
(int32_t)_mm_extract_epi32(a.v, 2) >> _mm_extract_epi32(b.v, 2),
(int32_t)_mm_extract_epi32(a.v, 3) >> _mm_extract_epi32(b.v, 3));
return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) >>
_mm_extract_epi32(b.v, 0),
(int32_t)_mm_extract_epi32(a.v, 1) >>
_mm_extract_epi32(b.v, 1),
(int32_t)_mm_extract_epi32(a.v, 2) >>
_mm_extract_epi32(b.v, 2),
(int32_t)_mm_extract_epi32(a.v, 3) >>
_mm_extract_epi32(b.v, 3));
}
static FORCEINLINE __vec4_i32 __ashr(__vec4_i32 a, int32_t b) {
return _mm_sra_epi32(a.v, _mm_set_epi32(0, 0, 0, b));
}
static FORCEINLINE __vec4_i1 __equal(__vec4_i32 a, __vec4_i32 b) {
@@ -1016,6 +1081,12 @@ static FORCEINLINE __vec4_i64 __shl(__vec4_i64 a, __vec4_i64 b) {
_mm_extract_epi64(a.v[1], 1) << _mm_extract_epi64(b.v[1], 1));
}
static FORCEINLINE __vec4_i64 __shl(__vec4_i64 a, int32_t b) {
__m128i amt = _mm_set_epi32(0, 0, 0, b);
return __vec4_i64(_mm_sll_epi64(a.v[0], amt),
_mm_sll_epi64(a.v[1], amt));
}
static FORCEINLINE __vec4_i64 __udiv(__vec4_i64 a, __vec4_i64 b) {
return __vec4_i64((uint64_t)_mm_extract_epi64(a.v[0], 0) /
(uint64_t)_mm_extract_epi64(b.v[0], 0),
@@ -1071,6 +1142,12 @@ static FORCEINLINE __vec4_i64 __lshr(__vec4_i64 a, __vec4_i64 b) {
(uint64_t)_mm_extract_epi64(b.v[1], 1));
}
static FORCEINLINE __vec4_i64 __lshr(__vec4_i64 a, int32_t b) {
__m128i amt = _mm_set_epi32(0, 0, 0, b);
return __vec4_i64(_mm_srl_epi64(a.v[0], amt),
_mm_srl_epi64(a.v[1], amt));
}
static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, __vec4_i64 b) {
return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) >>
(int64_t)_mm_extract_epi64(b.v[0], 0),
@@ -1082,6 +1159,13 @@ static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, __vec4_i64 b) {
(int64_t)_mm_extract_epi64(b.v[1], 1));
}
static FORCEINLINE __vec4_i64 __ashr(__vec4_i64 a, int32_t b) {
return __vec4_i64((int64_t)_mm_extract_epi64(a.v[0], 0) >> b,
(int64_t)_mm_extract_epi64(a.v[0], 1) >> b,
(int64_t)_mm_extract_epi64(a.v[1], 0) >> b,
(int64_t)_mm_extract_epi64(a.v[1], 1) >> b);
}
static FORCEINLINE __vec4_i1 __equal(__vec4_i64 a, __vec4_i64 b) {
__m128i cmp0 = _mm_cmpeq_epi64(a.v[0], b.v[0]);
__m128i cmp1 = _mm_cmpeq_epi64(a.v[1], b.v[1]);