Fix a number of issues in examples/intrinsics/sse4.h.

This had gotten fairly out of date after recent changes to the C++ output.
Roughly 15 tests still fail with this target.

Issue #278.
Matt Pharr
2012-06-08 12:52:36 -07:00
parent e730a5364b
commit 27e39954d6


@@ -88,6 +88,8 @@ struct __vec4_f {
__m128 v;
};
struct __vec4_d;
struct __vec4_i64 {
__vec4_i64() { }
FORCEINLINE __vec4_i64(__m128i a, __m128i b) { v[0] = a; v[1] = b; }
@@ -101,6 +103,8 @@ struct __vec4_i64 {
v[0] = _mm_loadu_si128((__m128i *)p);
v[1] = _mm_loadu_si128((__m128i *)(p+2));
}
__vec4_i64(__vec4_d);
FORCEINLINE uint64_t &operator[](int i) { return ((uint64_t *)v)[i]; }
__m128i v[2];
@@ -155,17 +159,27 @@ struct __vec4_i8 {
struct __vec4_d {
__vec4_d() { }
FORCEINLINE __vec4_d() { }
FORCEINLINE __vec4_d(__m128d a, __m128d b) { v[0] = a; v[1] = b; }
FORCEINLINE __vec4_d(double a, double b, double c, double d) {
v[0] = _mm_set_pd(b, a);
v[1] = _mm_set_pd(d, c);
}
FORCEINLINE __vec4_d(__vec4_i64 v64) {
v[0] = _mm_castsi128_pd(v64.v[0]);
v[1] = _mm_castsi128_pd(v64.v[1]);
}
__m128d v[2];
};
FORCEINLINE __vec4_i64::__vec4_i64(__vec4_d vd) {
v[0] = _mm_castpd_si128(vd.v[0]);
v[1] = _mm_castpd_si128(vd.v[1]);
}
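The changes above follow the standard C++ pattern for two types that convert to each other: forward-declare the second type, declare (but do not define) the converting constructor inside the first, and define it out of line once both types are complete. A minimal standalone sketch of the pattern, with illustrative names A and B that are not from the file:
struct B;                 // forward declaration; B is incomplete here
struct A {
    A(B);                 // declaration only: an incomplete parameter type is fine
};
struct B {
    B(A) { /* A is already complete here, so this can be defined inline */ }
};
inline A::A(B) { /* defined after B is complete, like __vec4_i64(__vec4_d) above */ }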
///////////////////////////////////////////////////////////////////////////
// SSE helpers / utility functions
@@ -182,6 +196,8 @@ static FORCEINLINE float bits_as_float(uint32_t v) {
return u.f;
}
#define _mm_extract_ps_as_float(v, i) bits_as_float(_mm_extract_ps(v, i))
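_mm_extract_ps, despite its name, returns the selected lane's 32 bits as an int, so the new macro reinterprets the result through bits_as_float. Usage is then straightforward; the values below are illustrative:
__m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lane 0 holds 1.0f, lane 3 holds 4.0f
float x = _mm_extract_ps_as_float(v, 2);        // 3.0f, not its raw bit pattern 0x40400000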
template <typename T>
static FORCEINLINE T __select(bool test, T a, T b) {
return test ? a : b;
@@ -244,6 +260,21 @@ static FORCEINLINE __vec4_i1 __or(__vec4_i1 a, __vec4_i1 b) {
return _mm_or_ps(a.v, b.v);
}
static FORCEINLINE __vec4_i1 __not(__vec4_i1 a) {
__m128 allon = _mm_castsi128_ps(_mm_set_epi32(-1, -1, -1, -1));
return _mm_xor_ps(a.v, allon);
}
static FORCEINLINE __vec4_i1 __and_not1(__vec4_i1 a, __vec4_i1 b) {
__m128 allon = _mm_castsi128_ps(_mm_set_epi32(-1, -1, -1, -1));
return _mm_and_ps(_mm_xor_ps(a.v, allon), b.v);
}
static FORCEINLINE __vec4_i1 __and_not2(__vec4_i1 a, __vec4_i1 b) {
__m128 allon = _mm_castsi128_ps(_mm_set_epi32(-1, -1, -1, -1));
return _mm_and_ps(a.v, _mm_xor_ps(b.v, allon));
}
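__and_not1 computes ~a & b and __and_not2 computes a & ~b. SSE provides the first form as a single instruction, _mm_andnot_ps, so an equivalent and likely cheaper formulation would be the following sketch (the _alt names are illustrative, not in the file):
// _mm_andnot_ps(x, y) computes (~x) & y in one instruction.
static FORCEINLINE __vec4_i1 __and_not1_alt(__vec4_i1 a, __vec4_i1 b) {
    return _mm_andnot_ps(a.v, b.v);   // ~a & b
}
static FORCEINLINE __vec4_i1 __and_not2_alt(__vec4_i1 a, __vec4_i1 b) {
    return _mm_andnot_ps(b.v, a.v);   // a & ~b
}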
static FORCEINLINE __vec4_i1 __select(__vec4_i1 mask, __vec4_i1 a, __vec4_i1 b) {
return _mm_blendv_ps(b.v, a.v, mask.v);
}
@@ -2184,6 +2215,122 @@ static FORCEINLINE __vec4_d __sqrt_varying_double(__vec4_d v) {
return __vec4_d(_mm_sqrt_pd(v.v[0]), _mm_sqrt_pd(v.v[1]));
}
static FORCEINLINE __vec4_f __pow_varying_float(__vec4_f a, __vec4_f b) {
float r[4];
for (int i = 0; i < 4; ++i)
r[i] = powf(__extract_element(a, i), __extract_element(b, i));
return __vec4_f(r);
}
static FORCEINLINE float __pow_uniform_float(float a, float b) {
return powf(a, b);
}
static FORCEINLINE __vec4_f __exp_varying_float(__vec4_f a) {
float r[4];
for (int i = 0; i < 4; ++i)
r[i] = expf(__extract_element(a, i));
return __vec4_f(r);
}
static FORCEINLINE float __exp_uniform_float(float a) {
return expf(a);
}
static FORCEINLINE __vec4_f __log_varying_float(__vec4_f a) {
float r[4];
for (int i = 0; i < 4; ++i)
r[i] = logf(__extract_element(a, i));
return __vec4_f(r);
}
static FORCEINLINE float __log_uniform_float(float a) {
return logf(a);
}
static FORCEINLINE int __intbits(float v) {
union {
float f;
int i;
} u;
u.f = v;
return u.i;
}
static FORCEINLINE float __floatbits(int v) {
union {
float f;
int i;
} u;
u.i = v;
return u.f;
}
static FORCEINLINE float __half_to_float_uniform(int16_t h) {
static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
int32_t o = ((int32_t)(h & 0x7fff)) << 13; // exponent/mantissa bits
uint32_t exp = shifted_exp & o; // just the exponent
o += (127 - 15) << 23; // exponent adjust
// handle exponent special cases
if (exp == shifted_exp) // Inf/NaN?
o += (128 - 16) << 23; // extra exp adjust
else if (exp == 0) { // Zero/Denormal?
o += 1 << 23; // extra exp adjust
o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize
}
o |= ((int32_t)(h & 0x8000)) << 16; // sign bit
return __floatbits(o);
}
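As a sanity check on the bit manipulation, one can trace a simple input through the routine; the test function below is a hypothetical spot check, not part of the file:
#include <assert.h>
static void test_half_to_float() {
    // h = 0x3C00 (half 1.0): o = 0x3C00 << 13 = 0x07800000 (exp 15, mantissa 0);
    // exp != shifted_exp and exp != 0, so only the (127 - 15) << 23 bias applies,
    // giving 0x07800000 + 0x38000000 = 0x3F800000, the bit pattern of 1.0f.
    assert(__half_to_float_uniform((int16_t)0x3C00) == 1.0f);
}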
static FORCEINLINE __vec4_f __half_to_float_varying(__vec4_i16 v) {
float ret[4];
for (int i = 0; i < 4; ++i)
ret[i] = __half_to_float_uniform(__extract_element(v, i));
return __vec4_f(ret);
}
static FORCEINLINE int16_t __float_to_half_uniform(float f) {
uint32_t sign_mask = 0x80000000u;
int32_t o;
int32_t fint = __intbits(f);
int32_t sign = fint & sign_mask;
fint ^= sign;
int32_t f32infty = 255 << 23;
o = (fint > f32infty) ? 0x7e00 : 0x7c00;
// (De)normalized number or zero
// update fint unconditionally to save the blending; we don't need it
// anymore for the Inf/NaN case anyway.
const uint32_t round_mask = ~0xfffu;
const int32_t magic = 15 << 23;
const int32_t f16infty = 31 << 23;
int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask;
fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
if (fint < f32infty)
o = fint2 >> 13; // Take the bits!
return (o | (sign >> 16));
}
static FORCEINLINE __vec4_i16 __float_to_half_varying(__vec4_f v) {
uint16_t ret[4];
for (int i = 0; i < 4; ++i)
ret[i] = __float_to_half_uniform(__extract_element(v, i));
return __vec4_i16(ret);
}
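A round trip through the two scalar helpers is lossless for any value exactly representable in half precision, which makes a cheap spot check; as above, this test function is illustrative and assumes <assert.h>:
#include <assert.h>
static void test_half_round_trip() {
    // 1.5f is exactly representable in half precision (sign 0, biased exponent 15,
    // mantissa .1), so converting there and back recovers the value exactly.
    int16_t h = __float_to_half_uniform(1.5f);   // 0x3E00
    assert(__half_to_float_uniform(h) == 1.5f);
}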
///////////////////////////////////////////////////////////////////////////
// bit ops
@@ -2788,114 +2935,57 @@ static FORCEINLINE __vec4_i16
static FORCEINLINE __vec4_i32
__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
__vec4_i32 constOffset, __vec4_i1 mask) {
__m128i r = _mm_set_epi32(0, 0, 0, 0);
#if 1
// "Fast gather"...
offsets = __select(mask, offsets, __smear_i32(__vec4_i32(), 0));
constOffset = __select(mask, constOffset, __smear_i32(__vec4_i32(), 0));
int offset = scale * _mm_extract_epi32(offsets.v, 0) +
_mm_extract_epi32(constOffset.v, 0);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 0);
offset = scale * _mm_extract_epi32(offsets.v, 1) +
_mm_extract_epi32(constOffset.v, 1);
ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 1);
offset = scale * _mm_extract_epi32(offsets.v, 2) +
_mm_extract_epi32(constOffset.v, 2);
ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 2);
offset = scale * _mm_extract_epi32(offsets.v, 3) +
_mm_extract_epi32(constOffset.v, 3);
ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 3);
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 0) +
_mm_extract_epi32(constOffset.v, 0);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 1) +
_mm_extract_epi32(constOffset.v, 1);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 2) +
_mm_extract_epi32(constOffset.v, 2);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 2);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 3) +
_mm_extract_epi32(constOffset.v, 3);
uint32_t *ptr = (uint32_t *)(p + offset);
r = _mm_insert_epi32(r, *ptr, 3);
}
#endif
return r;
return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, offsets, scale,
constOffset, mask);
}
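The hand-unrolled insert/extract body (including the disabled per-lane masked path) is replaced by a call into the lGatherBaseOffsets32/64 templates; the value-initialized first two arguments just select the return vector type and the per-lane scalar type. The helper's definition is outside this hunk, so the following is only a guess at its shape, inferred from the call sites and the removed code:
// Hypothetical sketch, not the file's actual body. Assumes an __insert_element
// helper analogous to the __extract_element used elsewhere in the file.
template <typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets32(RetVec ret, RetScalar /*type tag*/, uint8_t *p,
                     __vec4_i32 offsets, uint32_t scale,
                     __vec4_i32 constOffset, __vec4_i1 mask) {
    for (int i = 0; i < 4; ++i) {
        int32_t offset = scale * __extract_element(offsets, i) +
                         __extract_element(constOffset, i);
        RetScalar *ptr = (RetScalar *)(p + offset);
        // Mask handling elided here; the removed "fast gather" instead clamped
        // masked-off lanes to offset zero and read all four lanes unconditionally.
        __insert_element(&ret, i, *ptr);
    }
    return ret;
}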
static FORCEINLINE __vec4_i32
__gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale,
delta, mask);
constOffset, mask);
}
static FORCEINLINE __vec4_f
__gather_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
__vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_f(), float(), p, offsets, scale,
delta, mask);
constOffset, mask);
}
static FORCEINLINE __vec4_f
__gather_base_offsets64_float(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_f(), float(), p, offsets, scale,
delta, mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i64
__gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale,
delta, mask);
constOffset, mask);
}
static FORCEINLINE __vec4_i64
__gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale,
delta, mask);
constOffset, mask);
}
static FORCEINLINE __vec4_d
__gather_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) {
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_d(), double(), p, offsets, scale,
delta, mask);
constOffset, mask);
}
static FORCEINLINE __vec4_d
__gather_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_d(), double(), p, offsets, scale,
delta, mask);
constOffset, mask);
}
template<typename RetVec, typename RetScalar>
@@ -3033,11 +3123,11 @@ static FORCEINLINE __vec4_i32 __gather64_i32(__vec4_i64 ptrs, __vec4_i1 mask) {
}
static FORCEINLINE __vec4_f __gather32_float(__vec4_i32 ptrs, __vec4_i1 mask) {
return __vec4_f(__gather32_i32(ptrs, mask);
return __vec4_f(__gather32_i32(ptrs, mask));
}
static FORCEINLINE __vec4_f __gather64_float(__vec4_i32 ptrs, __vec4_i1 mask) {
return __vec4_f(__gather64_i32(ptrs, mask);
static FORCEINLINE __vec4_f __gather64_float(__vec4_i64 ptrs, __vec4_i1 mask) {
return __vec4_f(__gather64_i32(ptrs, mask));
}
static FORCEINLINE __vec4_i64 __gather32_i64(__vec4_i32 ptrs, __vec4_i1 mask) {
@@ -3058,11 +3148,11 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) {
// scatter
#define SCATTER32_64(SUFFIX, TYPE, EXTRACT) \
#define SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \
static FORCEINLINE void \
__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
uint32_t scale, __vec4_i32 constOffset, \
__vec4_##SUFFIX val, __vec4_i1 mask) { \
__vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \
@@ -3091,7 +3181,7 @@ __scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
static FORCEINLINE void \
__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
uint32_t scale, __vec4_i64 constOffset, \
__vec4_##SUFFIX val, __vec4_i1 mask) { \
__vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + \
@@ -3123,10 +3213,10 @@ __scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
}
SCATTER32_64(i8, int8_t, _mm_extract_epi8)
SCATTER32_64(i16, int16_t, _mm_extract_epi16)
SCATTER32_64(i32, int32_t, _mm_extract_epi32)
SCATTER32_64(f, float, _mm_extract_epi32)
SCATTER32_64(i8, i8, int8_t, _mm_extract_epi8)
SCATTER32_64(i16, i16, int16_t, _mm_extract_epi16)
SCATTER32_64(i32, i32, int32_t, _mm_extract_epi32)
SCATTER32_64(float, f, float, _mm_extract_ps_as_float)
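The macro now takes the function-name suffix and the vector-type suffix separately because the two diverge for float: the function must be named __scatter_base_offsets32_float while the value type is __vec4_f (and the lanes must be extracted as floats, not via _mm_extract_epi32). Given the macro body above, the last invocation should expand to a signature along these lines:
// Expansion of SCATTER32_64(float, f, float, _mm_extract_ps_as_float), 32-bit variant:
static FORCEINLINE void
__scatter_base_offsets32_float(unsigned char *b, __vec4_i32 offsets,
                               uint32_t scale, __vec4_i32 constOffset,
                               __vec4_f val, __vec4_i1 mask) {
    // ... per-lane masked stores, each reading its lane via
    // _mm_extract_ps_as_float(val.v, i) ...
}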
static FORCEINLINE void