Removing some sources of warnings in sse4.h, and trailing spaces
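All of the substantive changes below fix one warning pattern: `_mm_extract_epi32` returns a plain 32-bit `int`, and casting that value directly to a pointer type draws an int-to-pointer-cast warning from the compiler. Each such cast now widens through `uintptr_t` first, making the conversion explicit; the remaining hunks only strip trailing whitespace. A minimal sketch of the before and after (the helper name `lane0_ptr` is illustrative, not from the file):

#include <smmintrin.h>   // SSE4.1, for _mm_extract_epi32
#include <stdint.h>

static int32_t *lane0_ptr(__m128i ptrs) {
    // Before: (int32_t *)_mm_extract_epi32(ptrs, 0) -- an int cast
    // straight to a pointer, which the compiler warns about.
    // After: widen to uintptr_t first, then convert to the pointer type.
    // The lane must still hold a valid address for this to be meaningful.
    return (int32_t *)((uintptr_t)_mm_extract_epi32(ptrs, 0));
}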
@@ -28,7 +28,7 @@
 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #include <stdint.h>
@@ -63,7 +63,7 @@ struct __vec4_i1 {
     __vec4_i1(__m128 vv) : v(vv) { }
     FORCEINLINE __vec4_i1(__m128i vv) : v(_mm_castsi128_ps(vv)) { }
     FORCEINLINE __vec4_i1(int a, int b, int c, int d) {
         v = _mm_castsi128_ps(_mm_set_epi32(d ? -1 : 0, c ? -1 : 0,
                                            b ? -1 : 0, a ? -1 : 0));
     }

@@ -91,9 +91,9 @@ struct __vec4_i64 {
     __vec4_i64() { }
     FORCEINLINE __vec4_i64(__m128i a, __m128i b) { v[0] = a; v[1] = b; }
     FORCEINLINE __vec4_i64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
         v[0] = _mm_set_epi32((b >> 32) & 0xffffffff, b & 0xffffffff,
                              (a >> 32) & 0xffffffff, a & 0xffffffff);
         v[1] = _mm_set_epi32((d >> 32) & 0xffffffff, d & 0xffffffff,
                              (c >> 32) & 0xffffffff, c & 0xffffffff);
     }
     FORCEINLINE __vec4_i64(uint64_t *p) {
@@ -144,10 +144,10 @@ struct __vec4_i8 {
     FORCEINLINE __vec4_i8(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
         v = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                          0, 0, 0, 0, d, c, b, a);

     }
     FORCEINLINE __vec4_i8(uint8_t *p) {
         v = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                          0, 0, 0, 0, p[3], p[2], p[1], p[0]);
     }

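A note on the constructors just above: the `_mm_set_*` intrinsics take their arguments from the highest element down to element 0, which is why the bytes appear reversed as `d, c, b, a` and `p[3], p[2], p[1], p[0]`. A small standalone check of that ordering, assuming SSE4.1 for `_mm_extract_epi8`:

#include <smmintrin.h>
#include <assert.h>

int main() {
    // _mm_set_epi8 lists bytes high-to-low, so the last argument is lane 0.
    __m128i v = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                             0, 0, 0, 0, 4, 3, 2, 1);
    assert(_mm_extract_epi8(v, 0) == 1);  // lane 0 holds the last argument
    assert(_mm_extract_epi8(v, 3) == 4);  // lane 3 holds the fourth-from-last
    return 0;
}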
@@ -391,31 +391,31 @@ static FORCEINLINE __vec4_i8 __shl(__vec4_i8 a, int32_t b) {
 }

 static FORCEINLINE __vec4_i8 __udiv(__vec4_i8 a, __vec4_i8 b) {
     return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) /
                      (uint8_t)_mm_extract_epi8(b.v, 0),
                      (uint8_t)_mm_extract_epi8(a.v, 1) /
                      (uint8_t)_mm_extract_epi8(b.v, 1),
                      (uint8_t)_mm_extract_epi8(a.v, 2) /
                      (uint8_t)_mm_extract_epi8(b.v, 2),
                      (uint8_t)_mm_extract_epi8(a.v, 3) /
                      (uint8_t)_mm_extract_epi8(b.v, 3));
 }

 static FORCEINLINE __vec4_i8 __sdiv(__vec4_i8 a, __vec4_i8 b) {
     return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) /
                      (int8_t)_mm_extract_epi8(b.v, 0),
                      (int8_t)_mm_extract_epi8(a.v, 1) /
                      (int8_t)_mm_extract_epi8(b.v, 1),
                      (int8_t)_mm_extract_epi8(a.v, 2) /
                      (int8_t)_mm_extract_epi8(b.v, 2),
                      (int8_t)_mm_extract_epi8(a.v, 3) /
                      (int8_t)_mm_extract_epi8(b.v, 3));
 }

 static FORCEINLINE __vec4_i8 __urem(__vec4_i8 a, __vec4_i8 b) {
     return __vec4_i8((uint8_t)_mm_extract_epi8(a.v, 0) %
                      (uint8_t)_mm_extract_epi8(b.v, 0),
                      (uint8_t)_mm_extract_epi8(a.v, 1) %
                      (uint8_t)_mm_extract_epi8(b.v, 1),
                      (uint8_t)_mm_extract_epi8(a.v, 2) %
                      (uint8_t)_mm_extract_epi8(b.v, 2),
@@ -424,9 +424,9 @@ static FORCEINLINE __vec4_i8 __urem(__vec4_i8 a, __vec4_i8 b) {
 }

 static FORCEINLINE __vec4_i8 __srem(__vec4_i8 a, __vec4_i8 b) {
     return __vec4_i8((int8_t)_mm_extract_epi8(a.v, 0) %
                      (int8_t)_mm_extract_epi8(b.v, 0),
                      (int8_t)_mm_extract_epi8(a.v, 1) %
                      (int8_t)_mm_extract_epi8(b.v, 1),
                      (int8_t)_mm_extract_epi8(a.v, 2) %
                      (int8_t)_mm_extract_epi8(b.v, 2),
@@ -490,7 +490,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_equal_i8(__vec4_i8 a, __vec4_i8 b)
                      (uint8_t)_mm_extract_epi8(b.v, 1),
                      (uint8_t)_mm_extract_epi8(a.v, 2) <=
                      (uint8_t)_mm_extract_epi8(b.v, 2),
                      (uint8_t)_mm_extract_epi8(a.v, 3) <=
                      (uint8_t)_mm_extract_epi8(b.v, 3));
 }

@@ -554,13 +554,13 @@ static FORCEINLINE __vec4_i1 __signed_greater_equal_i8(__vec4_i8 a, __vec4_i8 b
 CMP_AND_MASK_INT(__vec4_i8, i8)

 static FORCEINLINE __vec4_i8 __select(__vec4_i1 mask, __vec4_i8 a, __vec4_i8 b) {
     return __vec4_i8((_mm_extract_ps(mask.v, 0) != 0) ? _mm_extract_epi8(a.v, 0) :
                                                         _mm_extract_epi8(b.v, 0),
                      (_mm_extract_ps(mask.v, 1) != 0) ? _mm_extract_epi8(a.v, 1) :
                                                         _mm_extract_epi8(b.v, 1),
                      (_mm_extract_ps(mask.v, 2) != 0) ? _mm_extract_epi8(a.v, 2) :
                                                         _mm_extract_epi8(b.v, 2),
                      (_mm_extract_ps(mask.v, 3) != 0) ? _mm_extract_epi8(a.v, 3) :
                                                         _mm_extract_epi8(b.v, 3));
 }

@@ -605,7 +605,7 @@ static FORCEINLINE __vec4_i8 __shuffle_i8(__vec4_i8 v, __vec4_i32 index) {
                      __extract_element(v, __extract_element(index, 3) & 0x3));
 }

 static FORCEINLINE __vec4_i8 __shuffle2_i8(__vec4_i8 v0, __vec4_i8 v1,
                                            __vec4_i32 index) {
     uint8_t r[4];
     for (int i = 0; i < 4; ++i) {
@@ -762,7 +762,7 @@ static FORCEINLINE __vec4_i1 __unsigned_less_equal_i16(__vec4_i16 a, __vec4_i16
                       (uint16_t)_mm_extract_epi16(b.v, 1),
                       (uint16_t)_mm_extract_epi16(a.v, 2) <=
                       (uint16_t)_mm_extract_epi16(b.v, 2),
                       (uint16_t)_mm_extract_epi16(a.v, 3) <=
                       (uint16_t)_mm_extract_epi16(b.v, 3));
 }

@@ -826,13 +826,13 @@ static FORCEINLINE __vec4_i1 __signed_greater_equal_i16(__vec4_i16 a, __vec4_i1
 CMP_AND_MASK_INT(__vec4_i16, i16)

 static FORCEINLINE __vec4_i16 __select(__vec4_i1 mask, __vec4_i16 a, __vec4_i16 b) {
     return __vec4_i16((_mm_extract_ps(mask.v, 0) != 0) ? _mm_extract_epi16(a.v, 0) :
                                                          _mm_extract_epi16(b.v, 0),
                       (_mm_extract_ps(mask.v, 1) != 0) ? _mm_extract_epi16(a.v, 1) :
                                                          _mm_extract_epi16(b.v, 1),
                       (_mm_extract_ps(mask.v, 2) != 0) ? _mm_extract_epi16(a.v, 2) :
                                                          _mm_extract_epi16(b.v, 2),
                       (_mm_extract_ps(mask.v, 3) != 0) ? _mm_extract_epi16(a.v, 3) :
                                                          _mm_extract_epi16(b.v, 3));
 }

@@ -877,7 +877,7 @@ static FORCEINLINE __vec4_i16 __shuffle_i16(__vec4_i16 v, __vec4_i32 index) {
                       __extract_element(v, __extract_element(index, 3) & 0x3));
 }

 static FORCEINLINE __vec4_i16 __shuffle2_i16(__vec4_i16 v0, __vec4_i16 v1,
                                              __vec4_i32 index) {
     uint16_t r[4];
     for (int i = 0; i < 4; ++i) {
@@ -950,13 +950,13 @@ _f___ii: ## @f___ii
         ret

 */
     return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) <<
                       _mm_extract_epi32(b.v, 0),
                       (uint32_t)_mm_extract_epi32(a.v, 1) <<
                       _mm_extract_epi32(b.v, 1),
                       (uint32_t)_mm_extract_epi32(a.v, 2) <<
                       _mm_extract_epi32(b.v, 2),
                       (uint32_t)_mm_extract_epi32(a.v, 3) <<
                       _mm_extract_epi32(b.v, 3));
 }

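The lane-by-lane extract-and-shift above is forced by the instruction set: SSE4 only has whole-register shift counts (`_mm_sll_epi32` shifts every lane by the same amount), and a true per-lane variable shift does not appear until AVX2. A hedged sketch of what the same operation collapses to on an AVX2 target, which this SSE4 header deliberately does not assume:

#include <immintrin.h>

// AVX2 alternative: one instruction instead of four extracts and shifts.
static __m128i shl_varlane_avx2(__m128i a, __m128i b) {
    return _mm_sllv_epi32(a, b);  // each lane of a shifted left by the matching lane of b
}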
@@ -965,24 +965,24 @@ static FORCEINLINE __vec4_i32 __shl(__vec4_i32 a, int32_t b) {
 }

 static FORCEINLINE __vec4_i32 __udiv(__vec4_i32 a, __vec4_i32 b) {
     return __vec4_i32((uint32_t)_mm_extract_epi32(a.v, 0) /
                       (uint32_t)_mm_extract_epi32(b.v, 0),
                       (uint32_t)_mm_extract_epi32(a.v, 1) /
                       (uint32_t)_mm_extract_epi32(b.v, 1),
                       (uint32_t)_mm_extract_epi32(a.v, 2) /
                       (uint32_t)_mm_extract_epi32(b.v, 2),
                       (uint32_t)_mm_extract_epi32(a.v, 3) /
                       (uint32_t)_mm_extract_epi32(b.v, 3));
 }

 static FORCEINLINE __vec4_i32 __sdiv(__vec4_i32 a, __vec4_i32 b) {
     return __vec4_i32((int32_t)_mm_extract_epi32(a.v, 0) /
                       (int32_t)_mm_extract_epi32(b.v, 0),
                       (int32_t)_mm_extract_epi32(a.v, 1) /
                       (int32_t)_mm_extract_epi32(b.v, 1),
                       (int32_t)_mm_extract_epi32(a.v, 2) /
                       (int32_t)_mm_extract_epi32(b.v, 2),
                       (int32_t)_mm_extract_epi32(a.v, 3) /
                       (int32_t)_mm_extract_epi32(b.v, 3));
 }

@@ -1090,7 +1090,7 @@ static FORCEINLINE __vec4_i1 __signed_greater_than_i32(__vec4_i32 a, __vec4_i32
 CMP_AND_MASK_INT(__vec4_i32, i32)

 static FORCEINLINE __vec4_i32 __select(__vec4_i1 mask, __vec4_i32 a, __vec4_i32 b) {
     return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b.v),
                                           _mm_castsi128_ps(a.v), mask.v));
 }

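Worth contrasting with the i8/i16 selects earlier: because the mask lanes and the i32 data lanes are both 32 bits wide, `__select` for i32 needs no per-lane extraction. `_mm_blendv_ps` picks the second source wherever the mask lane's sign bit is set, and an all-ones mask lane (-1) always has its sign bit set. The same shape, written out as a standalone sketch:

#include <smmintrin.h>

static __m128i select_i32(__m128i mask, __m128i a, __m128i b) {
    // blendv returns lanes of the second operand (a) where the mask lane's
    // sign bit is 1, else lanes of the first operand (b); an all-ones mask
    // lane has its sign bit set, so active lanes come from a.
    return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),
                                          _mm_castsi128_ps(a),
                                          _mm_castsi128_ps(mask)));
}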
@@ -1135,7 +1135,7 @@ static FORCEINLINE __vec4_i32 __shuffle_i32(__vec4_i32 v, __vec4_i32 index) {
                       __extract_element(v, __extract_element(index, 3) & 0x3));
 }

 static FORCEINLINE __vec4_i32 __shuffle2_i32(__vec4_i32 v0, __vec4_i32 v1,
                                              __vec4_i32 index) {
     uint32_t r[4];
     for (int i = 0; i < 4; ++i) {
@@ -1410,7 +1410,7 @@ static FORCEINLINE __vec4_i64 __shuffle_i64(__vec4_i64 v, __vec4_i32 index) {
                       __extract_element(v, __extract_element(index, 3) & 0x3));
 }

 static FORCEINLINE __vec4_i64 __shuffle2_i64(__vec4_i64 v0, __vec4_i64 v1,
                                              __vec4_i32 index) {
     uint64_t r[4];
     for (int i = 0; i < 4; ++i) {
@@ -1530,7 +1530,7 @@ static FORCEINLINE __vec4_f __shuffle_float(__vec4_f v, __vec4_i32 index) {
                     __extract_element(v, __extract_element(index, 3) & 0x3));
 }

 static FORCEINLINE __vec4_f __shuffle2_float(__vec4_f v0, __vec4_f v1,
                                              __vec4_i32 index) {
     float r[4];
     for (int i = 0; i < 4; ++i) {
@@ -1683,7 +1683,7 @@ static FORCEINLINE __vec4_d __shuffle_double(__vec4_d v, __vec4_i32 index) {
                     __extract_element(v, __extract_element(index, 3) & 0x3));
 }

 static FORCEINLINE __vec4_d __shuffle2_double(__vec4_d v0, __vec4_d v1,
                                               __vec4_i32 index) {
     double r[4];
     for (int i = 0; i < 4; ++i) {
@@ -2115,7 +2115,7 @@ static FORCEINLINE __vec4_f __cast_fptrunc(__vec4_f, __vec4_d val) {

 static FORCEINLINE __vec4_d __cast_fpext(__vec4_d, __vec4_f val) {
     return __vec4_d(_mm_cvtps_pd(val.v),
                     _mm_cvtps_pd(_mm_shuffle_ps(val.v, val.v,
                                                 _MM_SHUFFLE(3, 2, 3, 2))));
 }

@@ -2435,12 +2435,12 @@ static FORCEINLINE int16_t __float_to_half_uniform(float f) {
     fint ^= sign;

     int32_t f32infty = 255 << 23;
     o = (fint > f32infty) ? 0x7e00 : 0x7c00;

     // (De)normalized number or zero
     // update fint unconditionally to save the blending; we don't need it
     // anymore for the Inf/NaN case anyway.
     const uint32_t round_mask = ~0xfffu;
     const int32_t magic = 15 << 23;
     const int32_t f16infty = 31 << 23;

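For orientation, these constants are float bit patterns built in the exponent field (bits 23-30): `255 << 23` is `0x7f800000`, the bit pattern of float +Inf, so with the sign already cleared, `fint > f32infty` holds exactly when the mantissa is nonzero, i.e. for NaN, selecting the quiet-NaN half pattern `0x7e00` over Inf's `0x7c00`. `15 << 23` and `31 << 23` play the analogous bias and infinity roles for half precision in the rounding path that follows. A scalar illustration of just the Inf/NaN branch, assuming the same IEEE-754 layout:

#include <stdint.h>
#include <string.h>

// Only meaningful when f is Inf or NaN: with the sign bit cleared, any
// value above the +Inf bit pattern must carry a NaN payload.
static uint16_t half_inf_or_nan(float f) {
    int32_t fint;
    memcpy(&fint, &f, sizeof(fint));
    fint &= 0x7fffffff;                  // clear the sign, as the code above does
    const int32_t f32infty = 255 << 23;  // 0x7f800000, float +Inf
    return (fint > f32infty) ? 0x7e00    // quiet NaN in half precision
                             : 0x7c00;   // +Inf in half precision
}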
@@ -2791,7 +2791,7 @@ static FORCEINLINE __vec4_d __masked_load_double(void *p, __vec4_i1 mask) {
     return __vec4_d(v64);
 }

 static FORCEINLINE void __masked_store_i8(void *p, __vec4_i8 val,
                                           __vec4_i1 mask) {
     int8_t *ptr = (int8_t *)p;

@@ -2833,7 +2833,7 @@ static FORCEINLINE void __masked_store_i16(void *p, __vec4_i16 val,
     ptr[3] = _mm_extract_epi16(val.v, 3);
 }

 static FORCEINLINE void __masked_store_i32(void *p, __vec4_i32 val,
                                            __vec4_i1 mask) {
     int32_t *ptr = (int32_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2858,23 +2858,23 @@ static FORCEINLINE void __masked_store_float(void *p, __vec4_f val,
     __masked_store_i32(p, __vec4_i32(val), mask);
 }

 static FORCEINLINE void __masked_store_i64(void *p, __vec4_i64 val,
                                            __vec4_i1 mask) {
     int64_t *ptr = (int64_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0)
         ptr[0] = _mm_extract_epi64(val.v[0], 0);

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0)
         ptr[1] = _mm_extract_epi64(val.v[0], 1);

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0)
         ptr[2] = _mm_extract_epi64(val.v[1], 0);

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0)
         ptr[3] = _mm_extract_epi64(val.v[1], 1);
 }

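Throughout these masked stores, `_mm_extract_ps(mask.v, i)` serves as the per-lane test: despite the name it returns the lane's raw 32-bit pattern as an `int`, which is 0 for an inactive lane and all-ones for an active one, so `m != 0` simply asks whether lane i is enabled. An illustrative use of `__masked_store_i64` under that convention (values chosen arbitrarily, and assuming this header is included):

// Stores lanes 0 and 2 of val; lanes 1 and 3 of dst are untouched.
int64_t dst[4] = { -1, -1, -1, -1 };
__vec4_i1  mask(1, 0, 1, 0);             // per-lane all-ones / all-zeros
__vec4_i64 val(10, 20, 30, 40);
__masked_store_i64(dst, val, mask);      // dst becomes { 10, -1, 30, -1 }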
@@ -2883,34 +2883,34 @@ static FORCEINLINE void __masked_store_double(void *p, __vec4_d val,
     __masked_store_i64(p, __vec4_i64(val), mask);
 }

 static FORCEINLINE void __masked_store_blend_i8(void *p, __vec4_i8 val,
                                                 __vec4_i1 mask) {
     __masked_store_i8(p, val, mask);
 }

 static FORCEINLINE void __masked_store_blend_i16(void *p, __vec4_i16 val,
                                                  __vec4_i1 mask) {
     __masked_store_i16(p, val, mask);
 }

 static FORCEINLINE void __masked_store_blend_i32(void *p, __vec4_i32 val,
                                                  __vec4_i1 mask) {
     // FIXME: do a load, blendvps, store here...
     __masked_store_i32(p, val, mask);
 }

 static FORCEINLINE void __masked_store_blend_float(void *p, __vec4_f val,
                                                    __vec4_i1 mask) {
     __masked_store_i32(p, __vec4_i32(val), mask);
 }

 static FORCEINLINE void __masked_store_blend_i64(void *p, __vec4_i64 val,
                                                  __vec4_i1 mask) {
     // FIXME: do a 2x (load, blendvps, store) here...
     __masked_store_i64(p, val, mask);
 }

 static FORCEINLINE void __masked_store_blend_double(void *p, __vec4_d val,
                                                     __vec4_i1 mask) {
     __masked_store_i64(p, __vec4_i64(val), mask);
 }
@@ -2922,7 +2922,7 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec4_d val,

 template<typename RetVec, typename RetScalar>
 static FORCEINLINE RetVec
 lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, uint32_t scale,
                      __vec4_i32 offsets, __vec4_i1 mask) {
     RetScalar r[4];
 #if 1
@@ -2979,7 +2979,7 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, uint32_t scale,

 template<typename RetVec, typename RetScalar>
 static FORCEINLINE RetVec
 lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, uint32_t scale,
                      __vec4_i64 offsets, __vec4_i1 mask) {
     RetScalar r[4];
 #if 1
@@ -3059,7 +3059,7 @@ __gather_base_offsets64_i16(unsigned char *b, uint32_t scale, __vec4_i64 offsets
 }

 static FORCEINLINE __vec4_i32
 __gather_base_offsets32_i32(uint8_t *p, uint32_t scale, __vec4_i32 offsets,
                             __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, scale, offsets, mask);
 }
@@ -3071,7 +3071,7 @@ __gather_base_offsets64_i32(unsigned char *p, uint32_t scale, __vec4_i64 offsets
 }

 static FORCEINLINE __vec4_f
 __gather_base_offsets32_float(uint8_t *p, uint32_t scale, __vec4_i32 offsets,
                               __vec4_i1 mask) {
     return lGatherBaseOffsets32(__vec4_f(), float(), p, scale, offsets, mask);
 }
@@ -3107,30 +3107,30 @@ __gather_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offs
 }

 template<typename RetVec, typename RetScalar>
 static FORCEINLINE RetVec lGather32(RetVec, RetScalar, __vec4_i32 ptrs,
                                     __vec4_i1 mask) {
     RetScalar r[4];
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        RetScalar *ptr = (RetScalar *)_mm_extract_epi32(ptrs.v, 0);
+        RetScalar *ptr = (RetScalar *)((uintptr_t)_mm_extract_epi32(ptrs.v, 0));
         r[0] = *ptr;
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        RetScalar *ptr = (RetScalar *)_mm_extract_epi32(ptrs.v, 1);
+        RetScalar *ptr = (RetScalar *)((uintptr_t)_mm_extract_epi32(ptrs.v, 1));
         r[1] = *ptr;
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        RetScalar *ptr = (RetScalar *)_mm_extract_epi32(ptrs.v, 2);
+        RetScalar *ptr = (RetScalar *)((uintptr_t)_mm_extract_epi32(ptrs.v, 2));
         r[2] = *ptr;
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        RetScalar *ptr = (RetScalar *)_mm_extract_epi32(ptrs.v, 3);
+        RetScalar *ptr = (RetScalar *)((uintptr_t)_mm_extract_epi32(ptrs.v, 3));
         r[3] = *ptr;
     }

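Note that `lGather32` never dereferences the pointer lane of an inactive element, which is the point of the mask: inactive lanes may hold null or garbage addresses, and the matching lane of `r` is simply left uninitialized for downstream masking. An illustrative calling pattern for the `__gather32_i32` instantiation defined below; packing addresses into 32-bit lanes is only meaningful on a 32-bit target, which is why the new `uintptr_t` cast silences the warning rather than making 64-bit truncation safe:

// Illustrative only; assumes pointers fit in 32 bits and this header is included.
int32_t a = 7, b = 9;
__vec4_i32 ptrs((int32_t)(uintptr_t)&a, (int32_t)(uintptr_t)&b, 0, 0);
__vec4_i1  mask(1, 1, 0, 0);                // lanes 2 and 3 hold no valid address
__vec4_i32 r = __gather32_i32(ptrs, mask);  // dereferences only &a and &b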
@@ -3138,7 +3138,7 @@ static FORCEINLINE RetVec lGather32(RetVec, RetScalar, __vec4_i32 ptrs,
 }

 template<typename RetVec, typename RetScalar>
 static FORCEINLINE RetVec lGather64(RetVec, RetScalar, __vec4_i64 ptrs,
                                     __vec4_i1 mask) {
     RetScalar r[4];
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -3185,25 +3185,25 @@ static FORCEINLINE __vec4_i32 __gather32_i32(__vec4_i32 ptrs, __vec4_i1 mask) {
     __m128i r = _mm_set_epi32(0, 0, 0, 0);
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        int32_t *ptr = (int32_t *)_mm_extract_epi32(ptrs.v, 0);
+        int32_t *ptr = (int32_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 0));
         r = _mm_insert_epi32(r, *ptr, 0);
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        int32_t *ptr = (int32_t *)_mm_extract_epi32(ptrs.v, 1);
+        int32_t *ptr = (int32_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 1));
         r = _mm_insert_epi32(r, *ptr, 1);
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        int32_t *ptr = (int32_t *)_mm_extract_epi32(ptrs.v, 2);
+        int32_t *ptr = (int32_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 2));
         r = _mm_insert_epi32(r, *ptr, 2);
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        int32_t *ptr = (int32_t *)_mm_extract_epi32(ptrs.v, 3);
+        int32_t *ptr = (int32_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 3));
         r = _mm_insert_epi32(r, *ptr, 3);
     }

@@ -3265,7 +3265,7 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) {
 }

 // scatter

 #define SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \
 static FORCEINLINE void \
 __scatter_base_offsets32_##SUFFIX (unsigned char *b, uint32_t scale, \
@@ -3330,7 +3330,7 @@ SCATTER32_64(float, f, float, _mm_extract_ps_as_float)


 static FORCEINLINE void
 __scatter_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
                              __vec4_i64 val, __vec4_i1 mask) {
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
@@ -3362,7 +3362,7 @@ __scatter_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offset
 }

 static FORCEINLINE void
 __scatter_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
                              __vec4_i64 val, __vec4_i1 mask) {
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
@@ -3394,13 +3394,13 @@ __scatter_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offset
 }

 static FORCEINLINE void
 __scatter_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
                                 __vec4_d val, __vec4_i1 mask) {
     __scatter_base_offsets32_i64(p, scale, offsets, val, mask);
 }

 static FORCEINLINE void
 __scatter_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
                                 __vec4_d val, __vec4_i1 mask) {
     __scatter_base_offsets64_i64(p, scale, offsets, val, mask);
 }
@@ -3410,25 +3410,25 @@ static FORCEINLINE void __scatter32_i8(__vec4_i32 ptrs, __vec4_i8 val,
                                        __vec4_i1 mask) {
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        uint8_t *ptr = (uint8_t *)_mm_extract_epi32(ptrs.v, 0);
+        uint8_t *ptr = (uint8_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 0));
         *ptr = _mm_extract_epi8(val.v, 0);
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        uint8_t *ptr = (uint8_t *)_mm_extract_epi32(ptrs.v, 1);
+        uint8_t *ptr = (uint8_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 1));
         *ptr = _mm_extract_epi8(val.v, 1);
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        uint8_t *ptr = (uint8_t *)_mm_extract_epi32(ptrs.v, 2);
+        uint8_t *ptr = (uint8_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 2));
         *ptr = _mm_extract_epi8(val.v, 2);
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        uint8_t *ptr = (uint8_t *)_mm_extract_epi32(ptrs.v, 3);
+        uint8_t *ptr = (uint8_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 3));
         *ptr = _mm_extract_epi8(val.v, 3);
     }
 }
@@ -3464,25 +3464,25 @@ static FORCEINLINE void __scatter32_i16(__vec4_i32 ptrs, __vec4_i16 val,
                                         __vec4_i1 mask) {
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        uint16_t *ptr = (uint16_t *)_mm_extract_epi32(ptrs.v, 0);
+        uint16_t *ptr = (uint16_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 0));
         *ptr = _mm_extract_epi16(val.v, 0);
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        uint16_t *ptr = (uint16_t *)_mm_extract_epi32(ptrs.v, 1);
+        uint16_t *ptr = (uint16_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 1));
         *ptr = _mm_extract_epi16(val.v, 1);
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        uint16_t *ptr = (uint16_t *)_mm_extract_epi32(ptrs.v, 2);
+        uint16_t *ptr = (uint16_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 2));
         *ptr = _mm_extract_epi16(val.v, 2);
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        uint16_t *ptr = (uint16_t *)_mm_extract_epi32(ptrs.v, 3);
+        uint16_t *ptr = (uint16_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 3));
         *ptr = _mm_extract_epi16(val.v, 3);
     }
 }
@@ -3518,25 +3518,25 @@ static FORCEINLINE void __scatter32_i32(__vec4_i32 ptrs, __vec4_i32 val,
                                         __vec4_i1 mask) {
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        uint32_t *ptr = (uint32_t *)_mm_extract_epi32(ptrs.v, 0);
+        uint32_t *ptr = (uint32_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 0));
         *ptr = _mm_extract_epi32(val.v, 0);
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        uint32_t *ptr = (uint32_t *)_mm_extract_epi32(ptrs.v, 1);
+        uint32_t *ptr = (uint32_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 1));
         *ptr = _mm_extract_epi32(val.v, 1);
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        uint32_t *ptr = (uint32_t *)_mm_extract_epi32(ptrs.v, 2);
+        uint32_t *ptr = (uint32_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 2));
         *ptr = _mm_extract_epi32(val.v, 2);
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        uint32_t *ptr = (uint32_t *)_mm_extract_epi32(ptrs.v, 3);
+        uint32_t *ptr = (uint32_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 3));
         *ptr = _mm_extract_epi32(val.v, 3);
     }
 }
@@ -3578,29 +3578,29 @@ static FORCEINLINE void __scatter64_float(__vec4_i64 ptrs, __vec4_f val,
     __scatter64_i32(ptrs, __vec4_i32(val), mask);
 }

 static FORCEINLINE void __scatter32_i64(__vec4_i32 ptrs, __vec4_i64 val,
                                         __vec4_i1 mask) {
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0) {
-        uint64_t *ptr = (uint64_t *)_mm_extract_epi32(ptrs.v, 0);
+        uint64_t *ptr = (uint64_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 0));
         *ptr = _mm_extract_epi64(val.v[0], 0);
     }

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0) {
-        uint64_t *ptr = (uint64_t *)_mm_extract_epi32(ptrs.v, 1);
+        uint64_t *ptr = (uint64_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 1));
         *ptr = _mm_extract_epi64(val.v[0], 1);
     }

     m = _mm_extract_ps(mask.v, 2);
     if (m != 0) {
-        uint64_t *ptr = (uint64_t *)_mm_extract_epi32(ptrs.v, 2);
+        uint64_t *ptr = (uint64_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 2));
         *ptr = _mm_extract_epi64(val.v[1], 0);
     }

     m = _mm_extract_ps(mask.v, 3);
     if (m != 0) {
-        uint64_t *ptr = (uint64_t *)_mm_extract_epi32(ptrs.v, 3);
+        uint64_t *ptr = (uint64_t *)((uintptr_t)_mm_extract_epi32(ptrs.v, 3));
         *ptr = _mm_extract_epi64(val.v[1], 1);
     }
 }
@@ -3632,12 +3632,12 @@ static FORCEINLINE void __scatter64_i64(__vec4_i64 ptrs, __vec4_i64 val,
     }
 }

 static FORCEINLINE void __scatter32_double(__vec4_i32 ptrs, __vec4_d val,
                                            __vec4_i1 mask) {
     __scatter32_i64(ptrs, __vec4_i64(val), mask);
 }

 static FORCEINLINE void __scatter64_double(__vec4_i64 ptrs, __vec4_d val,
                                            __vec4_i1 mask) {
     __scatter64_i64(ptrs, __vec4_i64(val), mask);
 }
@@ -3648,11 +3648,11 @@ static FORCEINLINE void __scatter64_double(__vec4_i64 ptrs, __vec4_d val,
 static FORCEINLINE int32_t __packed_load_active(int32_t *ptr, __vec4_i32 *val,
                                                 __vec4_i1 mask) {
     int count = 0;
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0)
         val->v = _mm_insert_epi32(val->v, ptr[count++], 0);

     m = _mm_extract_ps(mask.v, 1);
     if (m != 0)
         val->v = _mm_insert_epi32(val->v, ptr[count++], 1);

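`__packed_load_active` is the load half of stream compaction: it reads values consecutively from `ptr` but deposits them only into the active lanes of `val`, returning how many values were consumed; lanes 2 and 3 (outside this hunk) follow the same pattern. A worked example of the semantics, with made-up values and assuming this header is included:

// With mask lanes {1, 0, 1, 1}, three consecutive values are read
// and placed into lanes 0, 2, and 3 of val.
int32_t data[] = { 100, 200, 300 };
__vec4_i32 val(0, 0, 0, 0);
__vec4_i1  mask(1, 0, 1, 1);
int32_t n = __packed_load_active(data, &val, mask);
// n == 3; val lanes are now { 100, 0, 200, 300 }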
@@ -3715,7 +3715,7 @@ static FORCEINLINE void __soa_to_aos3_float(__vec4_f v0, __vec4_f v1, __vec4_f v
     }
 }

 static FORCEINLINE void __aos_to_soa3_float(float *ptr, __vec4_f *out0,
                                             __vec4_f *out1, __vec4_f *out2) {
     for (int i = 0; i < 4; ++i) {
         __insert_element(out0, i, *ptr++);