diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 259756ad..59bc0880 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -88,6 +88,8 @@ struct __vec4_f {
     __m128 v;
 };
 
+struct __vec4_d;
+
 struct __vec4_i64 {
     __vec4_i64() { }
     FORCEINLINE __vec4_i64(__m128i a, __m128i b) { v[0] = a; v[1] = b; }
@@ -101,6 +103,8 @@ struct __vec4_i64 {
         v[0] = _mm_loadu_si128((__m128i *)p);
         v[1] = _mm_loadu_si128((__m128i *)(p+2));
     }
+    __vec4_i64(__vec4_d);
+
     FORCEINLINE uint64_t &operator[](int i) { return ((uint64_t *)v)[i]; }
     __m128i v[2];
@@ -155,17 +159,27 @@ struct __vec4_i8 {
 
 struct __vec4_d {
-    __vec4_d() { }
+    FORCEINLINE __vec4_d() { }
     FORCEINLINE __vec4_d(__m128d a, __m128d b) { v[0] = a; v[1] = b; }
     FORCEINLINE __vec4_d(double a, double b, double c, double d) {
         v[0] = _mm_set_pd(b, a);
         v[1] = _mm_set_pd(d, c);
     }
+    FORCEINLINE __vec4_d(__vec4_i64 v64) {
+        v[0] = _mm_castsi128_pd(v64.v[0]);
+        v[1] = _mm_castsi128_pd(v64.v[1]);
+    }
 
     __m128d v[2];
 };
 
+FORCEINLINE __vec4_i64::__vec4_i64(__vec4_d vd) {
+    v[0] = _mm_castpd_si128(vd.v[0]);
+    v[1] = _mm_castpd_si128(vd.v[1]);
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // SSE helpers / utility functions
 
@@ -182,6 +196,8 @@ static FORCEINLINE float bits_as_float(uint32_t v) {
     return u.f;
 }
 
+#define _mm_extract_ps_as_float(v, i) bits_as_float(_mm_extract_ps(v, i))
+
 template <typename T>
 static FORCEINLINE T __select(bool test, T a, T b) {
     return test ? a : b;
 }
@@ -244,6 +260,21 @@ static FORCEINLINE __vec4_i1 __or(__vec4_i1 a, __vec4_i1 b) {
     return _mm_or_ps(a.v, b.v);
 }
 
+static FORCEINLINE __vec4_i1 __not(__vec4_i1 a) {
+    __m128 allon = _mm_castsi128_ps(_mm_set_epi32(-1, -1, -1, -1));
+    return _mm_xor_ps(a.v, allon);
+}
+
+static FORCEINLINE __vec4_i1 __and_not1(__vec4_i1 a, __vec4_i1 b) {
+    __m128 allon = _mm_castsi128_ps(_mm_set_epi32(-1, -1, -1, -1));
+    return _mm_and_ps(_mm_xor_ps(a.v, allon), b.v);
+}
+
+static FORCEINLINE __vec4_i1 __and_not2(__vec4_i1 a, __vec4_i1 b) {
+    __m128 allon = _mm_castsi128_ps(_mm_set_epi32(-1, -1, -1, -1));
+    return _mm_and_ps(a.v, _mm_xor_ps(b.v, allon));
+}
+
 static FORCEINLINE __vec4_i1 __select(__vec4_i1 mask, __vec4_i1 a, __vec4_i1 b) {
     return _mm_blendv_ps(b.v, a.v, mask.v);
 }
@@ -2184,6 +2215,122 @@ static FORCEINLINE __vec4_d __sqrt_varying_double(__vec4_d v) {
     return __vec4_d(_mm_sqrt_pd(v.v[0]), _mm_sqrt_pd(v.v[1]));
 }
 
+static FORCEINLINE __vec4_f __pow_varying_float(__vec4_f a, __vec4_f b) {
+    float r[4];
+    for (int i = 0; i < 4; ++i)
+        r[i] = powf(__extract_element(a, i), __extract_element(b, i));
+    return __vec4_f(r);
+}
+
+static FORCEINLINE float __pow_uniform_float(float a, float b) {
+    return powf(a, b);
+}
+
+static FORCEINLINE __vec4_f __exp_varying_float(__vec4_f a) {
+    float r[4];
+    for (int i = 0; i < 4; ++i)
+        r[i] = expf(__extract_element(a, i));
+    return __vec4_f(r);
+}
+
+static FORCEINLINE float __exp_uniform_float(float a) {
+    return expf(a);
+}
+
+static FORCEINLINE __vec4_f __log_varying_float(__vec4_f a) {
+    float r[4];
+    for (int i = 0; i < 4; ++i)
+        r[i] = logf(__extract_element(a, i));
+    return __vec4_f(r);
+}
+
+static FORCEINLINE float __log_uniform_float(float a) {
+    return logf(a);
+}
+
+static FORCEINLINE int __intbits(float v) {
+    union {
+        float f;
+        int i;
+    } u;
+    u.f = v;
+    return u.i;
+}
+
+static FORCEINLINE float __floatbits(int v) {
+    union {
+        float f;
+        int i;
+    } u;
+    u.i = v;
+    return u.f;
+}
+
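+// __half_to_float_uniform() expands an IEEE half-precision value (1 sign
+// bit, 5 exponent bits with bias 15, 10 mantissa bits) to a float (8
+// exponent bits with bias 127, 23 mantissa bits): shifting left by 13
+// aligns the mantissa, adding (127 - 15) << 23 rebiases the exponent, and
+// the Inf/NaN and zero/denormal cases are then patched up separately.
+// For example, __half_to_float_uniform(0x3c00) returns 1.0f.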
+static FORCEINLINE float __half_to_float_uniform(int16_t h) {
+    static const uint32_t shifted_exp = 0x7c00 << 13; // exponent mask after shift
+
+    int32_t o = ((int32_t)(h & 0x7fff)) << 13;   // exponent/mantissa bits
+    uint32_t exp = shifted_exp & o;              // just the exponent
+    o += (127 - 15) << 23;                       // exponent adjust
+
+    // handle exponent special cases
+    if (exp == shifted_exp)    // Inf/NaN?
+        o += (128 - 16) << 23; // extra exp adjust
+    else if (exp == 0) {       // Zero/Denormal?
+        o += 1 << 23;          // extra exp adjust
+        o = __intbits(__floatbits(o) - __floatbits(113 << 23)); // renormalize
+    }
+
+    o |= ((int32_t)(h & 0x8000)) << 16; // sign bit
+    return __floatbits(o);
+}
+
+
+static FORCEINLINE __vec4_f __half_to_float_varying(__vec4_i16 v) {
+    float ret[4];
+    for (int i = 0; i < 4; ++i)
+        ret[i] = __half_to_float_uniform(__extract_element(v, i));
+    return __vec4_f(ret);
+}
+
+
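+// __float_to_half_uniform() is the reverse transformation: the multiply
+// by __floatbits(magic) == 2^-112 rebiases the exponent from 127 to 15;
+// subtracting round_mask as a 32-bit integer is the same as adding
+// 1 << 12, i.e. half of the 1 << 13 quantum that the final shift
+// discards, so the result is rounded rather than truncated.  Values too
+// large for a half clamp to infinity (exponent 31), and NaN maps to
+// 0x7e00.  For example, __float_to_half_uniform(1.0f) returns 0x3c00.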
+static FORCEINLINE int16_t __float_to_half_uniform(float f) {
+    uint32_t sign_mask = 0x80000000u;
+    int32_t o;
+
+    int32_t fint = __intbits(f);
+    int32_t sign = fint & sign_mask;
+    fint ^= sign;
+
+    int32_t f32infty = 255 << 23;
+    o = (fint > f32infty) ? 0x7e00 : 0x7c00;
+
+    // (De)normalized number or zero
+    // update fint unconditionally to save the blending; we don't need it
+    // anymore for the Inf/NaN case anyway.
+    const uint32_t round_mask = ~0xfffu;
+    const int32_t magic = 15 << 23;
+    const int32_t f16infty = 31 << 23;
+
+    int32_t fint2 = __intbits(__floatbits(fint & round_mask) * __floatbits(magic)) - round_mask;
+    fint2 = (fint2 > f16infty) ? f16infty : fint2; // Clamp to signed infinity if overflowed
+
+    if (fint < f32infty)
+        o = fint2 >> 13; // Take the bits!
+
+    return (o | (sign >> 16));
+}
+
+
+static FORCEINLINE __vec4_i16 __float_to_half_varying(__vec4_f v) {
+    uint16_t ret[4];
+    for (int i = 0; i < 4; ++i)
+        ret[i] = __float_to_half_uniform(__extract_element(v, i));
+    return __vec4_i16(ret);
+}
+
+
+
 ///////////////////////////////////////////////////////////////////////////
 // bit ops
 
@@ -2788,114 +2935,57 @@ static FORCEINLINE __vec4_i16
 static FORCEINLINE __vec4_i32
 __gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
                             __vec4_i32 constOffset, __vec4_i1 mask) {
-    __m128i r = _mm_set_epi32(0, 0, 0, 0);
-#if 1
-    // "Fast gather"...
-    offsets = __select(mask, offsets, __smear_i32(__vec4_i32(), 0));
-    constOffset = __select(mask, constOffset, __smear_i32(__vec4_i32(), 0));
-
-    int offset = scale * _mm_extract_epi32(offsets.v, 0) +
-        _mm_extract_epi32(constOffset.v, 0);
-    uint32_t *ptr = (uint32_t *)(p + offset);
-    r = _mm_insert_epi32(r, *ptr, 0);
-
-    offset = scale * _mm_extract_epi32(offsets.v, 1) +
-        _mm_extract_epi32(constOffset.v, 1);
-    ptr = (uint32_t *)(p + offset);
-    r = _mm_insert_epi32(r, *ptr, 1);
-
-    offset = scale * _mm_extract_epi32(offsets.v, 2) +
-        _mm_extract_epi32(constOffset.v, 2);
-    ptr = (uint32_t *)(p + offset);
-    r = _mm_insert_epi32(r, *ptr, 2);
-
-    offset = scale * _mm_extract_epi32(offsets.v, 3) +
-        _mm_extract_epi32(constOffset.v, 3);
-    ptr = (uint32_t *)(p + offset);
-    r = _mm_insert_epi32(r, *ptr, 3);
-#else
-    uint32_t m = _mm_extract_ps(mask.v, 0);
-    if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 0) +
-            _mm_extract_epi32(constOffset.v, 0);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        r = _mm_insert_epi32(r, *ptr, 0);
-    }
-
-    m = _mm_extract_ps(mask.v, 1);
-    if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 1) +
-            _mm_extract_epi32(constOffset.v, 1);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        r = _mm_insert_epi32(r, *ptr, 1);
-    }
-
-    m = _mm_extract_ps(mask.v, 2);
-    if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 2) +
-            _mm_extract_epi32(constOffset.v, 2);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        r = _mm_insert_epi32(r, *ptr, 2);
-    }
-
-    m = _mm_extract_ps(mask.v, 3);
-    if (m != 0) {
-        int offset = scale * _mm_extract_epi32(offsets.v, 3) +
-            _mm_extract_epi32(constOffset.v, 3);
-        uint32_t *ptr = (uint32_t *)(p + offset);
-        r = _mm_insert_epi32(r, *ptr, 3);
-    }
-#endif
-    return r;
+    return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, offsets, scale,
+                                constOffset, mask);
 }
 
 static FORCEINLINE __vec4_i32
 __gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
-                            uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale,
-                                delta, mask);
+                                constOffset, mask);
 }
 
 static FORCEINLINE __vec4_f
 __gather_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
                               __vec4_i32 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_f(), float(), p, offsets, scale,
-                                delta, mask);
+                                constOffset, mask);
 }
 
 static FORCEINLINE __vec4_f
 __gather_base_offsets64_float(unsigned char *p, __vec4_i64 offsets,
-                              uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
+                              uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_f(), float(), p, offsets, scale,
-                                delta, mask);
+                                constOffset, mask);
 }
 
 static FORCEINLINE __vec4_i64
 __gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
-                            uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale,
-                                delta, mask);
+                                constOffset, mask);
 }
 
 static FORCEINLINE __vec4_i64
 __gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
-                            uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
+                            uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale,
-                                delta, mask);
+                                constOffset, mask);
 }
 
 static FORCEINLINE __vec4_d
 __gather_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
-                               uint32_t scale, __vec4_i32 delta, __vec4_i1 mask) {
+                               uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_d(), double(), p, offsets, scale,
-                                delta, mask);
+                                constOffset, mask);
 }
 
 static FORCEINLINE __vec4_d
 __gather_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
-                               uint32_t scale, __vec4_i64 delta, __vec4_i1 mask) {
+                               uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
     return lGatherBaseOffsets64(__vec4_d(), double(), p, offsets, scale,
-                                delta, mask);
+                                constOffset, mask);
 }
 
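+// Note: SSE4 provides no gather instruction, so the __gather_base_offsets*
+// variants above all funnel through the lGatherBaseOffsets32/64 helper
+// templates (defined earlier in this file) instead of each repeating the
+// per-lane extract/load/insert sequence by hand.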
 template <typename RetVec, typename RetScalar>
@@ -3033,11 +3123,11 @@ static FORCEINLINE __vec4_i32 __gather64_i32(__vec4_i64 ptrs, __vec4_i1 mask) {
 }
 
 static FORCEINLINE __vec4_f __gather32_float(__vec4_i32 ptrs, __vec4_i1 mask) {
-    return __vec4_f(__gather32_i32(ptrs, mask);
+    return __vec4_f(__gather32_i32(ptrs, mask));
 }
 
-static FORCEINLINE __vec4_f __gather64_float(__vec4_i32 ptrs, __vec4_i1 mask) {
-    return __vec4_f(__gather64_i32(ptrs, mask);
+static FORCEINLINE __vec4_f __gather64_float(__vec4_i64 ptrs, __vec4_i1 mask) {
+    return __vec4_f(__gather64_i32(ptrs, mask));
 }
 
 static FORCEINLINE __vec4_i64 __gather32_i64(__vec4_i32 ptrs, __vec4_i1 mask) {
@@ -3058,11 +3148,11 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) {
 // scatter
 
-#define SCATTER32_64(SUFFIX, TYPE, EXTRACT) \
+#define SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \
 static FORCEINLINE void \
 __scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
                                    uint32_t scale, __vec4_i32 constOffset, \
-                                   __vec4_##SUFFIX val, __vec4_i1 mask) { \
+                                   __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
     uint32_t m = _mm_extract_ps(mask.v, 0); \
     if (m != 0) { \
         TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \
@@ -3091,7 +3181,7 @@ __scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
 static FORCEINLINE void \
 __scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
                                   uint32_t scale, __vec4_i64 constOffset, \
-                                  __vec4_##SUFFIX val, __vec4_i1 mask) { \
+                                  __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
     uint32_t m = _mm_extract_ps(mask.v, 0); \
     if (m != 0) { \
         int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + \
@@ -3123,10 +3213,10 @@ __scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \
 }
 
-SCATTER32_64(i8, int8_t, _mm_extract_epi8)
-SCATTER32_64(i16, int16_t, _mm_extract_epi16)
-SCATTER32_64(i32, int32_t, _mm_extract_epi32)
-SCATTER32_64(f, float, _mm_extract_epi32)
+SCATTER32_64(i8, i8, int8_t, _mm_extract_epi8)
+SCATTER32_64(i16, i16, int16_t, _mm_extract_epi16)
+SCATTER32_64(i32, i32, int32_t, _mm_extract_epi32)
+SCATTER32_64(float, f, float, _mm_extract_ps_as_float)
 
 static FORCEINLINE void