Improve naming of masked load/store instructions in builtins.
Use _i8/_i16/_i32/_i64 suffixes rather than _8/_16/_32/_64. Also cleaned up the m4 macro that generates these functions, using WIDTH to get the target width, etc.
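
The m4 side of the change isn't visible in the hunks below. As a rough C++ analogue of the pattern (illustration only; the real definitions come from the m4 macro, with WIDTH supplied by the target), each _iN masked load/store in the generic implementation reduces to a width-parameterized loop over the mask bits:

#include <stdint.h>

// Illustration only: "Vec" and "masked_load" are stand-ins, not the
// actual generated builtins.
template <typename ElementT, int WIDTH>
struct Vec {
    ElementT v[WIDTH];
};

// __masked_load_iN reads only the lanes whose mask bit is set; the
// corresponding __masked_store_iN is the mirror image (writes instead
// of reads).
template <typename ElementT, int WIDTH>
Vec<ElementT, WIDTH> masked_load(const void *p, uint64_t mask) {
    Vec<ElementT, WIDTH> ret = {};
    const ElementT *ptr = (const ElementT *)p;
    for (int i = 0; i < WIDTH; ++i)
        if (mask & (1ull << i))
            ret.v[i] = ptr[i];
    return ret;
}

// e.g. __masked_load_i32 on a 16-wide target corresponds to
// masked_load<int32_t, 16>(p, mask).
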
@@ -1101,8 +1101,8 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
 ///////////////////////////////////////////////////////////////////////////
 // masked load/store
 
-static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
-                                              __vec16_i1 mask) {
+static FORCEINLINE __vec16_i8 __masked_load_i8(void *p,
+                                               __vec16_i1 mask) {
     __vec16_i8 ret;
     int8_t *ptr = (int8_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1111,8 +1111,8 @@ static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
     return ret;
 }
 
-static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
-                                                __vec16_i1 mask) {
+static FORCEINLINE __vec16_i16 __masked_load_i16(void *p,
+                                                 __vec16_i1 mask) {
     __vec16_i16 ret;
     int16_t *ptr = (int16_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1121,8 +1121,8 @@ static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
     return ret;
 }
 
-static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
-                                                __vec16_i1 mask) {
+static FORCEINLINE __vec16_i32 __masked_load_i32(void *p,
+                                                 __vec16_i1 mask) {
     __vec16_i32 ret;
     int32_t *ptr = (int32_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1131,8 +1131,8 @@ static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
     return ret;
 }
 
-static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
-                                                __vec16_i1 mask) {
+static FORCEINLINE __vec16_i64 __masked_load_i64(void *p,
+                                                 __vec16_i1 mask) {
     __vec16_i64 ret;
     int64_t *ptr = (int64_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1141,31 +1141,31 @@ static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
     return ret;
 }
 
-static FORCEINLINE void __masked_store_8(void *p, __vec16_i8 val,
-                                         __vec16_i1 mask) {
+static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val,
+                                          __vec16_i1 mask) {
     int8_t *ptr = (int8_t *)p;
     for (int i = 0; i < 16; ++i)
         if ((mask.v & (1 << i)) != 0)
             ptr[i] = val.v[i];
 }
 
-static FORCEINLINE void __masked_store_16(void *p, __vec16_i16 val,
-                                          __vec16_i1 mask) {
+static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val,
+                                           __vec16_i1 mask) {
     int16_t *ptr = (int16_t *)p;
     for (int i = 0; i < 16; ++i)
         if ((mask.v & (1 << i)) != 0)
             ptr[i] = val.v[i];
 }
 
-static FORCEINLINE void __masked_store_32(void *p, __vec16_i32 val,
-                                          __vec16_i1 mask) {
+static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val,
+                                           __vec16_i1 mask) {
     int32_t *ptr = (int32_t *)p;
     for (int i = 0; i < 16; ++i)
         if ((mask.v & (1 << i)) != 0)
             ptr[i] = val.v[i];
 }
 
-static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
+static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val,
                                           __vec16_i1 mask) {
     int64_t *ptr = (int64_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1173,24 +1173,28 @@ static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
             ptr[i] = val.v[i];
 }
 
-static FORCEINLINE void __masked_store_blend_8(void *p, __vec16_i8 val,
-                                               __vec16_i1 mask) {
-    __masked_store_8(p, val, mask);
-}
-
-static FORCEINLINE void __masked_store_blend_16(void *p, __vec16_i16 val,
+static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val,
                                                 __vec16_i1 mask) {
-    __masked_store_16(p, val, mask);
+    __masked_store_i8(p, val, mask);
 }
 
-static FORCEINLINE void __masked_store_blend_32(void *p, __vec16_i32 val,
-                                                __vec16_i1 mask) {
-    __masked_store_32(p, val, mask);
+static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val,
+                                                 __vec16_i1 mask) {
+    __masked_store_i16(p, val, mask);
 }
 
+static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val,
+                                                 __vec16_i1 mask) {
+    __masked_store_i32(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val,
+                                                 __vec16_i1 mask) {
+    __masked_store_i64(p, val, mask);
+}
+
-static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
-                                                __vec16_i1 mask) {
-    __masked_store_64(p, val, mask);
-}
 
 ///////////////////////////////////////////////////////////////////////////
@@ -2415,8 +2415,7 @@ static FORCEINLINE uint64_t __reduce_max_uint64(__vec4_i64 v) {
 ///////////////////////////////////////////////////////////////////////////
 // masked load/store
 
-static FORCEINLINE __vec4_i8 __masked_load_8(void *p,
-                                             __vec4_i1 mask) {
+static FORCEINLINE __vec4_i8 __masked_load_i8(void *p, __vec4_i1 mask) {
     int8_t r[4];
     int8_t *ptr = (int8_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2435,8 +2434,7 @@ static FORCEINLINE __vec4_i8 __masked_load_8(void *p,
     return __vec4_i8(r[0], r[1], r[2], r[3]);
 }
 
-static FORCEINLINE __vec4_i16 __masked_load_16(void *p,
-                                               __vec4_i1 mask) {
+static FORCEINLINE __vec4_i16 __masked_load_i16(void *p, __vec4_i1 mask) {
     int16_t r[4];
     int16_t *ptr = (int16_t *)p;
 
@@ -2459,8 +2457,7 @@ static FORCEINLINE __vec4_i16 __masked_load_16(void *p,
     return __vec4_i16(r[0], r[1], r[2], r[3]);
 }
 
-static FORCEINLINE __vec4_i32 __masked_load_32(void *p,
-                                               __vec4_i1 mask) {
+static FORCEINLINE __vec4_i32 __masked_load_i32(void *p, __vec4_i1 mask) {
     __m128i r = _mm_set_epi32(0, 0, 0, 0);
     int32_t *ptr = (int32_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2482,8 +2479,7 @@ static FORCEINLINE __vec4_i32 __masked_load_32(void *p,
     return r;
 }
 
-static FORCEINLINE __vec4_i64 __masked_load_64(void *p,
-                                               __vec4_i1 mask) {
+static FORCEINLINE __vec4_i64 __masked_load_i64(void *p, __vec4_i1 mask) {
     uint64_t r[4];
     uint64_t *ptr = (uint64_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2505,8 +2501,8 @@ static FORCEINLINE __vec4_i64 __masked_load_64(void *p,
     return __vec4_i64(r[0], r[1], r[2], r[3]);
 }
 
-static FORCEINLINE void __masked_store_8(void *p, __vec4_i8 val,
-                                         __vec4_i1 mask) {
+static FORCEINLINE void __masked_store_i8(void *p, __vec4_i8 val,
+                                          __vec4_i1 mask) {
     int8_t *ptr = (int8_t *)p;
 
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2526,8 +2522,8 @@ static FORCEINLINE void __masked_store_8(void *p, __vec4_i8 val,
         ptr[3] = _mm_extract_epi8(val.v, 3);
 }
 
-static FORCEINLINE void __masked_store_16(void *p, __vec4_i16 val,
-                                          __vec4_i1 mask) {
+static FORCEINLINE void __masked_store_i16(void *p, __vec4_i16 val,
+                                           __vec4_i1 mask) {
     int16_t *ptr = (int16_t *)p;
 
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2547,8 +2543,8 @@ static FORCEINLINE void __masked_store_16(void *p, __vec4_i16 val,
         ptr[3] = _mm_extract_epi16(val.v, 3);
 }
 
-static FORCEINLINE void __masked_store_32(void *p, __vec4_i32 val,
-                                          __vec4_i1 mask) {
+static FORCEINLINE void __masked_store_i32(void *p, __vec4_i32 val,
+                                           __vec4_i1 mask) {
     int32_t *ptr = (int32_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0)
@@ -2567,8 +2563,8 @@ static FORCEINLINE void __masked_store_32(void *p, __vec4_i32 val,
         ptr[3] = _mm_extract_epi32(val.v, 3);
 }
 
-static FORCEINLINE void __masked_store_64(void *p, __vec4_i64 val,
-                                          __vec4_i1 mask) {
+static FORCEINLINE void __masked_store_i64(void *p, __vec4_i64 val,
+                                           __vec4_i1 mask) {
     int64_t *ptr = (int64_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0)
@@ -2587,26 +2583,29 @@ static FORCEINLINE void __masked_store_64(void *p, __vec4_i64 val,
         ptr[3] = _mm_extract_epi64(val.v[1], 1);
 }
 
-static FORCEINLINE void __masked_store_blend_8(void *p, __vec4_i8 val,
-                                               __vec4_i1 mask) {
-    __masked_store_8(p, val, mask);
-}
-
-static FORCEINLINE void __masked_store_blend_16(void *p, __vec4_i16 val,
+static FORCEINLINE void __masked_store_blend_i8(void *p, __vec4_i8 val,
                                                 __vec4_i1 mask) {
-    __masked_store_16(p, val, mask);
+    __masked_store_i8(p, val, mask);
 }
 
-static FORCEINLINE void __masked_store_blend_32(void *p, __vec4_i32 val,
-                                                __vec4_i1 mask) {
+static FORCEINLINE void __masked_store_blend_i16(void *p, __vec4_i16 val,
+                                                 __vec4_i1 mask) {
+    __masked_store_i16(p, val, mask);
 }
 
+static FORCEINLINE void __masked_store_blend_i32(void *p, __vec4_i32 val,
+                                                 __vec4_i1 mask) {
     // FIXME: do a load, blendvps, store here...
-    __masked_store_32(p, val, mask);
+    __masked_store_i32(p, val, mask);
 }
 
-static FORCEINLINE void __masked_store_blend_64(void *p, __vec4_i64 val,
+static FORCEINLINE void __masked_store_blend_i64(void *p, __vec4_i64 val,
                                                 __vec4_i1 mask) {
     // FIXME: do a 2x (load, blendvps, store) here...
-    __masked_store_64(p, val, mask);
+    __masked_store_i64(p, val, mask);
 }
 
 ///////////////////////////////////////////////////////////////////////////
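
The FIXME in __masked_store_blend_i32 above points at a load / blendvps / store sequence. A minimal sketch of that idea, assuming SSE4.1 (<smmintrin.h>), the __vec4 types from this file, an all-ones-per-active-lane value in mask.v, and that the blend variant is allowed to read and rewrite the inactive lanes (the _sketch name is hypothetical, not part of the commit):

static FORCEINLINE void __masked_store_blend_i32_sketch(void *p, __vec4_i32 val,
                                                        __vec4_i1 mask) {
    // Load the existing four lanes, merge in the active lanes of val
    // (_mm_blendv_ps selects by the sign bit of each 32-bit lane of
    // mask.v), and write the whole vector back.
    __m128 old = _mm_loadu_ps((float *)p);
    __m128 merged = _mm_blendv_ps(old, _mm_castsi128_ps(val.v), mask.v);
    _mm_storeu_ps((float *)p, merged);
}

The "2x" FIXME in __masked_store_blend_i64 would apply the same load/blend/store to each __m128i half of val.v, after widening the four mask lanes to 64-bit lanes.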