Merge pull request #321 from mmp/setzero

More varied support for constant vectors from C++ backend.
This commit is contained in:
Jean-Luc Duprat
2012-07-09 08:57:05 -07:00
7 changed files with 330 additions and 104 deletions

View File

@@ -262,13 +262,26 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
}
#define SMEAR(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __smear_##NAME(VTYPE retType, STYPE v) { \
static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = v; \
return ret; \
}
#define SETZERO(VTYPE, NAME) \
static FORCEINLINE VTYPE __setzero_##NAME() { \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = 0; \
return ret; \
}
#define UNDEF(VTYPE, NAME) \
static FORCEINLINE VTYPE __undef_##NAME() { \
return VTYPE(); \
}
#define BROADCAST(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
VTYPE ret; \
@@ -394,11 +407,21 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
*ptr = v.v;
}
static FORCEINLINE __vec16_i1 __smear_i1(__vec16_i1, int v) {
static FORCEINLINE __vec16_i1 __smear_i1(int v) {
return __vec16_i1(v, v, v, v, v, v, v, v,
v, v, v, v, v, v, v, v);
}
static FORCEINLINE __vec16_i1 __setzero_i1() {
return __vec16_i1(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0);
}
static FORCEINLINE __vec16_i1 __undef_i1() {
return __vec16_i1();
}
///////////////////////////////////////////////////////////////////////////
// int8
@@ -438,6 +461,8 @@ CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >)
SELECT(__vec16_i8)
INSERT_EXTRACT(__vec16_i8, int8_t)
SMEAR(__vec16_i8, i8, int8_t)
SETZERO(__vec16_i8, i8)
UNDEF(__vec16_i8, i8)
BROADCAST(__vec16_i8, i8, int8_t)
ROTATE(__vec16_i8, i8, int8_t)
SHUFFLES(__vec16_i8, i8, int8_t)
@@ -481,6 +506,8 @@ CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >)
SELECT(__vec16_i16)
INSERT_EXTRACT(__vec16_i16, int16_t)
SMEAR(__vec16_i16, i16, int16_t)
SETZERO(__vec16_i16, i16)
UNDEF(__vec16_i16, i16)
BROADCAST(__vec16_i16, i16, int16_t)
ROTATE(__vec16_i16, i16, int16_t)
SHUFFLES(__vec16_i16, i16, int16_t)
@@ -524,6 +551,8 @@ CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >)
SELECT(__vec16_i32)
INSERT_EXTRACT(__vec16_i32, int32_t)
SMEAR(__vec16_i32, i32, int32_t)
SETZERO(__vec16_i32, i32)
UNDEF(__vec16_i32, i32)
BROADCAST(__vec16_i32, i32, int32_t)
ROTATE(__vec16_i32, i32, int32_t)
SHUFFLES(__vec16_i32, i32, int32_t)
@@ -567,6 +596,8 @@ CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >)
SELECT(__vec16_i64)
INSERT_EXTRACT(__vec16_i64, int64_t)
SMEAR(__vec16_i64, i64, int64_t)
SETZERO(__vec16_i64, i64)
UNDEF(__vec16_i64, i64)
BROADCAST(__vec16_i64, i64, int64_t)
ROTATE(__vec16_i64, i64, int64_t)
SHUFFLES(__vec16_i64, i64, int64_t)
@@ -602,6 +633,8 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
SELECT(__vec16_f)
INSERT_EXTRACT(__vec16_f, float)
SMEAR(__vec16_f, float, float)
SETZERO(__vec16_f, float)
UNDEF(__vec16_f, float)
BROADCAST(__vec16_f, float, float)
ROTATE(__vec16_f, float, float)
SHUFFLES(__vec16_f, float, float)
@@ -752,6 +785,8 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
SELECT(__vec16_d)
INSERT_EXTRACT(__vec16_d, double)
SMEAR(__vec16_d, double, double)
SETZERO(__vec16_d, double)
UNDEF(__vec16_d, double)
BROADCAST(__vec16_d, double, double)
ROTATE(__vec16_d, double, double)
SHUFFLES(__vec16_d, double, double)

View File

@@ -327,13 +327,26 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
}
#define SMEAR(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __smear_##NAME(VTYPE retType, STYPE v) { \
static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
VTYPE ret; \
for (int i = 0; i < 32; ++i) \
ret.v[i] = v; \
return ret; \
}
#define SETZERO(VTYPE, NAME) \
static FORCEINLINE VTYPE __setzero_##NAME() { \
VTYPE ret; \
for (int i = 0; i < 32; ++i) \
ret.v[i] = 0; \
return ret; \
}
#define UNDEF(VTYPE, NAME) \
static FORCEINLINE VTYPE __undef_##NAME() { \
return VTYPE(); \
}
#define BROADCAST(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
VTYPE ret; \
@@ -459,13 +472,24 @@ template <int ALIGN> static FORCEINLINE void __store(__vec32_i1 *p, __vec32_i1 v
*ptr = v.v;
}
static FORCEINLINE __vec32_i1 __smear_i1(__vec32_i1, int v) {
static FORCEINLINE __vec32_i1 __smear_i1(int v) {
return __vec32_i1(v, v, v, v, v, v, v, v,
v, v, v, v, v, v, v, v,
v, v, v, v, v, v, v, v,
v, v, v, v, v, v, v, v);
}
static FORCEINLINE __vec32_i1 __setzero_i1() {
return __vec32_i1(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0);
}
static FORCEINLINE __vec32_i1 __undef_i1() {
return __vec32_i1();
}
///////////////////////////////////////////////////////////////////////////
// int8
@@ -505,6 +529,8 @@ CMP_OP(__vec32_i8, i8, int8_t, __signed_greater_than, >)
SELECT(__vec32_i8)
INSERT_EXTRACT(__vec32_i8, int8_t)
SMEAR(__vec32_i8, i8, int8_t)
SETZERO(__vec32_i8, i8)
UNDEF(__vec32_i8, i8)
BROADCAST(__vec32_i8, i8, int8_t)
ROTATE(__vec32_i8, i8, int8_t)
SHUFFLES(__vec32_i8, i8, int8_t)
@@ -548,6 +574,8 @@ CMP_OP(__vec32_i16, i16, int16_t, __signed_greater_than, >)
SELECT(__vec32_i16)
INSERT_EXTRACT(__vec32_i16, int16_t)
SMEAR(__vec32_i16, i16, int16_t)
SETZERO(__vec32_i16, i16)
UNDEF(__vec32_i16, i16)
BROADCAST(__vec32_i16, i16, int16_t)
ROTATE(__vec32_i16, i16, int16_t)
SHUFFLES(__vec32_i16, i16, int16_t)
@@ -591,6 +619,8 @@ CMP_OP(__vec32_i32, i32, int32_t, __signed_greater_than, >)
SELECT(__vec32_i32)
INSERT_EXTRACT(__vec32_i32, int32_t)
SMEAR(__vec32_i32, i32, int32_t)
SETZERO(__vec32_i32, i32)
UNDEF(__vec32_i32, i32)
BROADCAST(__vec32_i32, i32, int32_t)
ROTATE(__vec32_i32, i32, int32_t)
SHUFFLES(__vec32_i32, i32, int32_t)
@@ -634,6 +664,8 @@ CMP_OP(__vec32_i64, i64, int64_t, __signed_greater_than, >)
SELECT(__vec32_i64)
INSERT_EXTRACT(__vec32_i64, int64_t)
SMEAR(__vec32_i64, i64, int64_t)
SETZERO(__vec32_i64, i64)
UNDEF(__vec32_i64, i64)
BROADCAST(__vec32_i64, i64, int64_t)
ROTATE(__vec32_i64, i64, int64_t)
SHUFFLES(__vec32_i64, i64, int64_t)
@@ -669,6 +701,8 @@ static FORCEINLINE __vec32_i1 __ordered(__vec32_f a, __vec32_f b) {
SELECT(__vec32_f)
INSERT_EXTRACT(__vec32_f, float)
SMEAR(__vec32_f, float, float)
SETZERO(__vec32_f, float)
UNDEF(__vec32_f, float)
BROADCAST(__vec32_f, float, float)
ROTATE(__vec32_f, float, float)
SHUFFLES(__vec32_f, float, float)
@@ -819,6 +853,8 @@ static FORCEINLINE __vec32_i1 __ordered(__vec32_d a, __vec32_d b) {
SELECT(__vec32_d)
INSERT_EXTRACT(__vec32_d, double)
SMEAR(__vec32_d, double, double)
SETZERO(__vec32_d, double)
UNDEF(__vec32_d, double)
BROADCAST(__vec32_d, double, double)
ROTATE(__vec32_d, double, double)
SHUFFLES(__vec32_d, double, double)

View File

@@ -452,13 +452,26 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
}
#define SMEAR(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __smear_##NAME(VTYPE retType, STYPE v) { \
static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
VTYPE ret; \
for (int i = 0; i < 64; ++i) \
ret.v[i] = v; \
return ret; \
}
#define SETZERO(VTYPE, NAME) \
static FORCEINLINE VTYPE __setzero_##NAME() { \
VTYPE ret; \
for (int i = 0; i < 64; ++i) \
ret.v[i] = 0; \
return ret; \
}
#define UNDEF(VTYPE, NAME) \
static FORCEINLINE VTYPE __undef_##NAME(VTYPE retType) { \
return VTYPE(); \
}
#define BROADCAST(VTYPE, NAME, STYPE) \
static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
VTYPE ret; \
@@ -584,7 +597,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec64_i1 *p, __vec64_i1 v
*ptr = v.v;
}
static FORCEINLINE __vec64_i1 __smear_i1(__vec64_i1, int v) {
static FORCEINLINE __vec64_i1 __smear_i1(int v) {
return __vec64_i1(v, v, v, v, v, v, v, v,
v, v, v, v, v, v, v, v,
v, v, v, v, v, v, v, v,
@@ -595,6 +608,21 @@ static FORCEINLINE __vec64_i1 __smear_i1(__vec64_i1, int v) {
v, v, v, v, v, v, v, v);
}
static FORCEINLINE __vec64_i1 __setzero_i1() {
return __vec64_i1(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0);
}
static FORCEINLINE __vec64_i1 __undef_i1() {
return __vec64_i1();
}
///////////////////////////////////////////////////////////////////////////
// int8
@@ -634,6 +662,8 @@ CMP_OP(__vec64_i8, i8, int8_t, __signed_greater_than, >)
SELECT(__vec64_i8)
INSERT_EXTRACT(__vec64_i8, int8_t)
SMEAR(__vec64_i8, i8, int8_t)
SETZERO(__vec64_i8, i8)
UNDEF(__vec64_i8, i8)
BROADCAST(__vec64_i8, i8, int8_t)
ROTATE(__vec64_i8, i8, int8_t)
SHUFFLES(__vec64_i8, i8, int8_t)
@@ -677,6 +707,8 @@ CMP_OP(__vec64_i16, i16, int16_t, __signed_greater_than, >)
SELECT(__vec64_i16)
INSERT_EXTRACT(__vec64_i16, int16_t)
SMEAR(__vec64_i16, i16, int16_t)
SETZERO(__vec64_i16, i16)
UNDEF(__vec64_i16, i16)
BROADCAST(__vec64_i16, i16, int16_t)
ROTATE(__vec64_i16, i16, int16_t)
SHUFFLES(__vec64_i16, i16, int16_t)
@@ -720,6 +752,8 @@ CMP_OP(__vec64_i32, i32, int32_t, __signed_greater_than, >)
SELECT(__vec64_i32)
INSERT_EXTRACT(__vec64_i32, int32_t)
SMEAR(__vec64_i32, i32, int32_t)
SETZERO(__vec64_i32, i32)
UNDEF(__vec64_i32, i32)
BROADCAST(__vec64_i32, i32, int32_t)
ROTATE(__vec64_i32, i32, int32_t)
SHUFFLES(__vec64_i32, i32, int32_t)
@@ -763,6 +797,8 @@ CMP_OP(__vec64_i64, i64, int64_t, __signed_greater_than, >)
SELECT(__vec64_i64)
INSERT_EXTRACT(__vec64_i64, int64_t)
SMEAR(__vec64_i64, i64, int64_t)
SETZERO(__vec64_i64, i64)
UNDEF(__vec64_i64, i64)
BROADCAST(__vec64_i64, i64, int64_t)
ROTATE(__vec64_i64, i64, int64_t)
SHUFFLES(__vec64_i64, i64, int64_t)
@@ -798,6 +834,8 @@ static FORCEINLINE __vec64_i1 __ordered(__vec64_f a, __vec64_f b) {
SELECT(__vec64_f)
INSERT_EXTRACT(__vec64_f, float)
SMEAR(__vec64_f, float, float)
SETZERO(__vec64_f, float)
UNDEF(__vec64_f, float)
BROADCAST(__vec64_f, float, float)
ROTATE(__vec64_f, float, float)
SHUFFLES(__vec64_f, float, float)
@@ -948,6 +986,8 @@ static FORCEINLINE __vec64_i1 __ordered(__vec64_d a, __vec64_d b) {
SELECT(__vec64_d)
INSERT_EXTRACT(__vec64_d, double)
SMEAR(__vec64_d, double, double)
SETZERO(__vec64_d, double)
UNDEF(__vec64_d, double)
BROADCAST(__vec64_d, double, double)
ROTATE(__vec64_d, double, double)
SHUFFLES(__vec64_d, double, double)

View File

@@ -477,10 +477,18 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
*ptr = v.m;
}
static FORCEINLINE __vec16_i1 __smear_i1(__vec16_i1, int i) {
static FORCEINLINE __vec16_i1 __smear_i1(int i) {
return i?0xFFFF:0x0;
}
static FORCEINLINE __vec16_i1 __setzero_i1() {
return 0;
}
static FORCEINLINE __vec16_i1 __undef_i1() {
return __vec16_i1(); // FIXME? __mm512_undef_mask();
}
///////////////////////////////////////////////////////////////////////////
// int8
@@ -686,10 +694,18 @@ static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b)
static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { return ((int32_t *)&v)[index]; }
static FORCEINLINE void __insert_element(__vec16_i32 *v, int index, int32_t val) { ((int32_t *)v)[index] = val; }
static FORCEINLINE __vec16_i32 __smear_i32(__vec16_i32, int32_t i) {
static FORCEINLINE __vec16_i32 __smear_i32(int32_t i) {
return _mm512_set_1to16_epi32(i);
}
static FORCEINLINE __vec16_i32 __setzero_i32() {
return _mm512_setzero_epi32();
}
static FORCEINLINE __vec16_i32 __undef_i32() {
return _mm512_undefined_epi32();
}
static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) {
int32_t val = __extract_element(v, index & 0xf);
return _mm512_set_1to16_epi32(val);
@@ -966,10 +982,18 @@ static FORCEINLINE void __insert_element(__vec16_f *v, int index, float val) {
((float *)v)[index] = val;
}
static FORCEINLINE __vec16_f __smear_float(__vec16_f, float f) {
static FORCEINLINE __vec16_f __smear_float(float f) {
return _mm512_set_1to16_ps(f);
}
static FORCEINLINE __vec16_f __setzero_float() {
return _mm512_setzero_ps();
}
static FORCEINLINE __vec16_f __undef_float() {
return _mm512_undefined_ps();
}
static FORCEINLINE __vec16_f __broadcast_float(__vec16_f v, int index) {
int32_t val = __extract_element(v, index & 0xf);
return _mm512_set_1to16_ps(val);
@@ -1116,13 +1140,27 @@ static FORCEINLINE void __insert_element(__vec16_d *v, int index, double val) {
((double *)v)[index] = val;
}
static FORCEINLINE __vec16_d __smear_double(__vec16_d, double d) {
static FORCEINLINE __vec16_d __smear_double(double d) {
__vec16_d ret;
ret.v1 = _mm512_extload_pd(&d, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
ret.v2 = ret.v1;
return ret;
}
static FORCEINLINE __vec16_d __setzero_double() {
__vec16_d ret;
ret.v1 = _mm512_setzero_pd();
ret.v2 = ret.v1;
return ret;
}
static FORCEINLINE __vec16_d __undef_double() {
__vec16_d ret;
ret.v1 = _mm512_undefined_pd();
ret.v2 = ret.v1;
return ret;
}
static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) {
__vec16_d ret;
int32_t val = __extract_element(v, index & 0xf);

View File

@@ -297,10 +297,18 @@ template <int ALIGN> static FORCEINLINE void __store(__vec4_i1 *p, __vec4_i1 val
_mm_storeu_ps((float *)(&p->v), value.v);
}
static FORCEINLINE __vec4_i1 __smear_i1(__vec4_i1, int v) {
static FORCEINLINE __vec4_i1 __smear_i1(int v) {
return __vec4_i1(v, v, v, v);
}
static FORCEINLINE __vec4_i1 __setzero_i1() {
return __vec4_i1(_mm_setzero_ps());
}
static FORCEINLINE __vec4_i1 __undef_i1() {
return __vec4_i1();
}
///////////////////////////////////////////////////////////////////////////
// int8
@@ -525,10 +533,18 @@ static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) {
((int8_t *)v)[index] = val;
}
static FORCEINLINE __vec4_i8 __smear_i8(__vec4_i8, int8_t v) {
static FORCEINLINE __vec4_i8 __smear_i8(int8_t v) {
return _mm_set1_epi8(v);
}
static FORCEINLINE __vec4_i8 __setzero_i8() {
return _mm_set1_epi8(0);
}
static FORCEINLINE __vec4_i8 __undef_i8() {
return __vec4_i8();
}
static FORCEINLINE __vec4_i8 __broadcast_i8(__vec4_i8 v, int index) {
return _mm_set1_epi8(__extract_element(v, index));
}
@@ -784,10 +800,18 @@ static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val)
((int16_t *)v)[index] = val;
}
static FORCEINLINE __vec4_i16 __smear_i16(__vec4_i16, int16_t v) {
static FORCEINLINE __vec4_i16 __smear_i16(int16_t v) {
return _mm_set1_epi16(v);
}
static FORCEINLINE __vec4_i16 __setzero_i16() {
return _mm_set1_epi16(0);
}
static FORCEINLINE __vec4_i16 __undef_i16() {
return __vec4_i16();
}
static FORCEINLINE __vec4_i16 __broadcast_i16(__vec4_i16 v, int index) {
return _mm_set1_epi16(__extract_element(v, index));
}
@@ -1021,10 +1045,18 @@ static FORCEINLINE __vec4_i32 __select(__vec4_i1 mask, __vec4_i32 a, __vec4_i32
_mm_castsi128_ps(a.v), mask.v));
}
static FORCEINLINE __vec4_i32 __smear_i32(__vec4_i32, int32_t v) {
static FORCEINLINE __vec4_i32 __smear_i32(int32_t v) {
return _mm_set1_epi32(v);
}
static FORCEINLINE __vec4_i32 __setzero_i32() {
return _mm_castps_si128(_mm_setzero_ps());
}
static FORCEINLINE __vec4_i32 __undef_i32() {
return __vec4_i32();
}
static FORCEINLINE int32_t __extract_element(__vec4_i32 v, int index) {
return ((int32_t *)&v)[index];
}
@@ -1282,10 +1314,18 @@ static FORCEINLINE __vec4_i64 __select(__vec4_i1 mask, __vec4_i64 a, __vec4_i64
return __vec4_i64(_mm_castpd_si128(r0), _mm_castpd_si128(r1));
}
static FORCEINLINE __vec4_i64 __smear_i64(__vec4_i64, int64_t v) {
static FORCEINLINE __vec4_i64 __smear_i64(int64_t v) {
return __vec4_i64(v, v, v, v);
}
static FORCEINLINE __vec4_i64 __setzero_i64() {
return __vec4_i64(0, 0, 0, 0);
}
static FORCEINLINE __vec4_i64 __undef_i64() {
return __vec4_i64();
}
static FORCEINLINE int64_t __extract_element(__vec4_i64 v, int index) {
return ((int64_t *)&v)[index];
}
@@ -1386,10 +1426,18 @@ static FORCEINLINE __vec4_f __select(__vec4_i1 mask, __vec4_f a, __vec4_f b) {
return _mm_blendv_ps(b.v, a.v, mask.v);
}
static FORCEINLINE __vec4_f __smear_float(__vec4_f, float v) {
static FORCEINLINE __vec4_f __smear_float(float v) {
return _mm_set1_ps(v);
}
static FORCEINLINE __vec4_f __setzero_float() {
return _mm_setzero_ps();
}
static FORCEINLINE __vec4_f __undef_float() {
return __vec4_f();
}
static FORCEINLINE float __extract_element(__vec4_f v, int index) {
return ((float *)&v)[index];
}
@@ -1518,10 +1566,18 @@ static FORCEINLINE __vec4_d __select(__vec4_i1 mask, __vec4_d a, __vec4_d b) {
return __vec4_d(r0, r1);
}
static FORCEINLINE __vec4_d __smear_double(__vec4_d, double v) {
static FORCEINLINE __vec4_d __smear_double(double v) {
return __vec4_d(_mm_set1_pd(v), _mm_set1_pd(v));
}
static FORCEINLINE __vec4_d __setzero_double() {
return __vec4_d(_mm_setzero_pd(), _mm_setzero_pd());
}
static FORCEINLINE __vec4_d __undef_double() {
return __vec4_d();
}
static FORCEINLINE double __extract_element(__vec4_d v, int index) {
return ((double *)&v)[index];
}
@@ -1618,13 +1674,11 @@ static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i8 val) {
}
static FORCEINLINE __vec4_i8 __cast_sext(__vec4_i8, __vec4_i1 v) {
return __select(v, __smear_i8(__vec4_i8(), 0xff),
__smear_i8(__vec4_i8(), 0));
return __select(v, __smear_i8(0xff), __setzero_i8());
}
static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i1 v) {
return __select(v, __smear_i16(__vec4_i16(), 0xffff),
__smear_i16(__vec4_i16(), 0));
return __select(v, __smear_i16(0xffff), __setzero_i16());
}
static FORCEINLINE __vec4_i32 __cast_sext(__vec4_i32, __vec4_i1 v) {
@@ -1684,12 +1738,11 @@ static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i8 val) {
}
static FORCEINLINE __vec4_i8 __cast_zext(__vec4_i8, __vec4_i1 v) {
return __select(v, __smear_i8(__vec4_i8(), 1), __smear_i8(__vec4_i8(), 0));
return __select(v, __smear_i8(1), __setzero_i8());
}
static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i1 v) {
return __select(v, __smear_i16(__vec4_i16(), 1),
__smear_i16(__vec4_i16(), 0));
return __select(v, __smear_i16(1), __setzero_i16());
}
static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
@@ -1697,7 +1750,7 @@ static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
}
static FORCEINLINE __vec4_i64 __cast_zext(__vec4_i64, __vec4_i1 v) {
return __select(v, __smear_i64(__vec4_i64(), 1), __smear_i64(__vec4_i64(), 0));
return __select(v, __smear_i64(1), __setzero_i64());
}
// truncations
@@ -1857,11 +1910,11 @@ static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i64 val) {
}
static FORCEINLINE __vec4_f __cast_uitofp(__vec4_f, __vec4_i1 v) {
return __select(v, __smear_float(__vec4_f(), 1.), __smear_float(__vec4_f(), 0.));
return __select(v, __smear_float(1.), __setzero_float());
}
static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i1 v) {
return __select(v, __smear_double(__vec4_d(), 1.), __smear_double(__vec4_d(), 0.));
return __select(v, __smear_double(1.), __setzero_double());
}
// float/double to signed int
@@ -2796,8 +2849,8 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __smear_i32(__vec4_i32(), 0));
constOffset = __select(mask, constOffset, __smear_i32(__vec4_i32(), 0));
offsets = __select(mask, offsets, __setzero_i32());
constOffset = __select(mask, constOffset, __setzero_i32());
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
RetScalar *ptr = (RetScalar *)(p + offset);
@@ -2854,8 +2907,8 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __smear_i64(__vec4_i64(), 0));
constOffset = __select(mask, constOffset, __smear_i64(__vec4_i64(), 0));
offsets = __select(mask, offsets, __setzero_i64());
constOffset = __select(mask, constOffset, __setzero_i64());
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
RetScalar *ptr = (RetScalar *)(p + offset);