Improve naming of masked load/store instructions in builtins.

The builtins now use _i8/_i16/_i32/_i64 suffixes rather than _8/_16/_32/_64.
Also cleaned up the m4 macro that generates these functions, using WIDTH to
get the target width.
Matt Pharr
2012-06-07 13:51:08 -07:00
parent 91d22d150f
commit b86d40091a
13 changed files with 299 additions and 308 deletions
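
The "m4 macro" note refers to how these builtins are generated: one macro
body is expanded per element width, with WIDTH giving the target's lane
count. Below is a hypothetical C-preprocessor analogue of that pattern (the
real source is m4, and these names are illustrative only, not the commit's
actual code):

#include <stdint.h>

#define WIDTH 16   /* stand-in for the per-target WIDTH definition in m4 */

/* One macro body yields every masked-store variant; BITS is pasted into
 * both the function name and the element type, which is what makes the
 * _i8/_i16/_i32/_i64 suffix scheme mechanical to generate. */
#define DEFINE_MASKED_STORE(BITS)                                       \
    static void masked_store_i##BITS(void *p, const int##BITS##_t *val, \
                                     uint32_t mask) {                   \
        int##BITS##_t *ptr = (int##BITS##_t *)p;                        \
        for (int i = 0; i < WIDTH; ++i)                                 \
            if ((mask & (1u << i)) != 0)                                \
                ptr[i] = val[i];                                        \
    }

DEFINE_MASKED_STORE(8)    /* masked_store_i8  */
DEFINE_MASKED_STORE(16)   /* masked_store_i16 */
DEFINE_MASKED_STORE(32)   /* masked_store_i32 */
DEFINE_MASKED_STORE(64)   /* masked_store_i64 */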

@@ -1101,8 +1101,8 @@ REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
 ///////////////////////////////////////////////////////////////////////////
 // masked load/store

-static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
-                                              __vec16_i1 mask) {
+static FORCEINLINE __vec16_i8 __masked_load_i8(void *p,
+                                               __vec16_i1 mask) {
     __vec16_i8 ret;
     int8_t *ptr = (int8_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1111,8 +1111,8 @@ static FORCEINLINE __vec16_i8 __masked_load_8(void *p,
     return ret;
 }

-static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
-                                                __vec16_i1 mask) {
+static FORCEINLINE __vec16_i16 __masked_load_i16(void *p,
+                                                 __vec16_i1 mask) {
     __vec16_i16 ret;
     int16_t *ptr = (int16_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1121,8 +1121,8 @@ static FORCEINLINE __vec16_i16 __masked_load_16(void *p,
     return ret;
 }

-static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
-                                                __vec16_i1 mask) {
+static FORCEINLINE __vec16_i32 __masked_load_i32(void *p,
+                                                 __vec16_i1 mask) {
     __vec16_i32 ret;
     int32_t *ptr = (int32_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1131,8 +1131,8 @@ static FORCEINLINE __vec16_i32 __masked_load_32(void *p,
     return ret;
 }

-static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
-                                                __vec16_i1 mask) {
+static FORCEINLINE __vec16_i64 __masked_load_i64(void *p,
+                                                 __vec16_i1 mask) {
     __vec16_i64 ret;
     int64_t *ptr = (int64_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1141,31 +1141,31 @@ static FORCEINLINE __vec16_i64 __masked_load_64(void *p,
     return ret;
 }

-static FORCEINLINE void __masked_store_8(void *p, __vec16_i8 val,
-                                         __vec16_i1 mask) {
+static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val,
+                                          __vec16_i1 mask) {
     int8_t *ptr = (int8_t *)p;
     for (int i = 0; i < 16; ++i)
         if ((mask.v & (1 << i)) != 0)
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_16(void *p, __vec16_i16 val,
-                                          __vec16_i1 mask) {
+static FORCEINLINE void __masked_store_i16(void *p, __vec16_i16 val,
+                                           __vec16_i1 mask) {
     int16_t *ptr = (int16_t *)p;
     for (int i = 0; i < 16; ++i)
         if ((mask.v & (1 << i)) != 0)
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_32(void *p, __vec16_i32 val,
-                                          __vec16_i1 mask) {
+static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val,
+                                           __vec16_i1 mask) {
     int32_t *ptr = (int32_t *)p;
     for (int i = 0; i < 16; ++i)
         if ((mask.v & (1 << i)) != 0)
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
-                                          __vec16_i1 mask) {
+static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val,
+                                           __vec16_i1 mask) {
     int64_t *ptr = (int64_t *)p;
     for (int i = 0; i < 16; ++i)
@@ -1173,24 +1173,28 @@ static FORCEINLINE void __masked_store_64(void *p, __vec16_i64 val,
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_blend_8(void *p, __vec16_i8 val,
-                                               __vec16_i1 mask) {
-    __masked_store_8(p, val, mask);
-}
-
-static FORCEINLINE void __masked_store_blend_16(void *p, __vec16_i16 val,
-                                                __vec16_i1 mask) {
-    __masked_store_16(p, val, mask);
-}
-
-static FORCEINLINE void __masked_store_blend_32(void *p, __vec16_i32 val,
-                                                __vec16_i1 mask) {
-    __masked_store_32(p, val, mask);
-}
-
-static FORCEINLINE void __masked_store_blend_64(void *p, __vec16_i64 val,
-                                                __vec16_i1 mask) {
-    __masked_store_64(p, val, mask);
-}
+static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val,
+                                                __vec16_i1 mask) {
+    __masked_store_i8(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_i16(void *p, __vec16_i16 val,
+                                                 __vec16_i1 mask) {
+    __masked_store_i16(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val,
+                                                 __vec16_i1 mask) {
+    __masked_store_i32(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val,
+                                                 __vec16_i1 mask) {
+    __masked_store_i64(p, val, mask);
+}

 ///////////////////////////////////////////////////////////////////////////
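
For reference, the semantics the functions above implement: lane i is
touched only when bit i of the mask is set. A minimal standalone sketch in
plain C (hypothetical names, scalar arrays rather than the target's vector
types):

#include <stdint.h>
#include <stdio.h>

/* Per-lane masked store, mirroring the loop bodies in the diff above. */
static void masked_store_i32_demo(int32_t *ptr, const int32_t *val,
                                  uint16_t mask, int width) {
    for (int i = 0; i < width; ++i)
        if ((mask & (1 << i)) != 0)
            ptr[i] = val[i];    /* inactive lanes keep their old value */
}

int main(void) {
    int32_t dst[4] = {0, 0, 0, 0};
    int32_t src[4] = {10, 20, 30, 40};
    masked_store_i32_demo(dst, src, 0x5, 4);    /* lanes 0 and 2 active */
    printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]); /* 10 0 30 0 */
    return 0;
}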

@@ -2415,8 +2415,7 @@ static FORCEINLINE uint64_t __reduce_max_uint64(__vec4_i64 v) {
 ///////////////////////////////////////////////////////////////////////////
 // masked load/store

-static FORCEINLINE __vec4_i8 __masked_load_8(void *p,
-                                             __vec4_i1 mask) {
+static FORCEINLINE __vec4_i8 __masked_load_i8(void *p, __vec4_i1 mask) {
     int8_t r[4];
     int8_t *ptr = (int8_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2435,8 +2434,7 @@ static FORCEINLINE __vec4_i8 __masked_load_8(void *p,
     return __vec4_i8(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE __vec4_i16 __masked_load_16(void *p,
-                                               __vec4_i1 mask) {
+static FORCEINLINE __vec4_i16 __masked_load_i16(void *p, __vec4_i1 mask) {
     int16_t r[4];
     int16_t *ptr = (int16_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2459,8 +2457,7 @@ static FORCEINLINE __vec4_i16 __masked_load_16(void *p,
     return __vec4_i16(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE __vec4_i32 __masked_load_32(void *p,
-                                               __vec4_i1 mask) {
+static FORCEINLINE __vec4_i32 __masked_load_i32(void *p, __vec4_i1 mask) {
     __m128i r = _mm_set_epi32(0, 0, 0, 0);
     int32_t *ptr = (int32_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2482,8 +2479,7 @@ static FORCEINLINE __vec4_i32 __masked_load_32(void *p,
     return r;
 }

-static FORCEINLINE __vec4_i64 __masked_load_64(void *p,
-                                               __vec4_i1 mask) {
+static FORCEINLINE __vec4_i64 __masked_load_i64(void *p, __vec4_i1 mask) {
     uint64_t r[4];
     uint64_t *ptr = (uint64_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
@@ -2505,8 +2501,8 @@ static FORCEINLINE __vec4_i64 __masked_load_64(void *p,
     return __vec4_i64(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE void __masked_store_8(void *p, __vec4_i8 val,
-                                         __vec4_i1 mask) {
+static FORCEINLINE void __masked_store_i8(void *p, __vec4_i8 val,
+                                          __vec4_i1 mask) {
     int8_t *ptr = (int8_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0)
@@ -2526,8 +2522,8 @@ static FORCEINLINE void __masked_store_8(void *p, __vec4_i8 val,
         ptr[3] = _mm_extract_epi8(val.v, 3);
 }

-static FORCEINLINE void __masked_store_16(void *p, __vec4_i16 val,
-                                          __vec4_i1 mask) {
+static FORCEINLINE void __masked_store_i16(void *p, __vec4_i16 val,
+                                           __vec4_i1 mask) {
     int16_t *ptr = (int16_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0)
@@ -2547,8 +2543,8 @@ static FORCEINLINE void __masked_store_16(void *p, __vec4_i16 val,
         ptr[3] = _mm_extract_epi16(val.v, 3);
 }

-static FORCEINLINE void __masked_store_32(void *p, __vec4_i32 val,
-                                          __vec4_i1 mask) {
+static FORCEINLINE void __masked_store_i32(void *p, __vec4_i32 val,
+                                           __vec4_i1 mask) {
     int32_t *ptr = (int32_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0)
@@ -2567,8 +2563,8 @@ static FORCEINLINE void __masked_store_32(void *p, __vec4_i32 val,
         ptr[3] = _mm_extract_epi32(val.v, 3);
 }

-static FORCEINLINE void __masked_store_64(void *p, __vec4_i64 val,
-                                          __vec4_i1 mask) {
+static FORCEINLINE void __masked_store_i64(void *p, __vec4_i64 val,
+                                           __vec4_i1 mask) {
     int64_t *ptr = (int64_t *)p;
     uint32_t m = _mm_extract_ps(mask.v, 0);
     if (m != 0)
@@ -2587,26 +2583,29 @@ static FORCEINLINE void __masked_store_64(void *p, __vec4_i64 val,
         ptr[3] = _mm_extract_epi64(val.v[1], 1);
 }

-static FORCEINLINE void __masked_store_blend_8(void *p, __vec4_i8 val,
-                                               __vec4_i1 mask) {
-    __masked_store_8(p, val, mask);
-}
-
-static FORCEINLINE void __masked_store_blend_16(void *p, __vec4_i16 val,
-                                                __vec4_i1 mask) {
-    __masked_store_16(p, val, mask);
-}
-
-static FORCEINLINE void __masked_store_blend_32(void *p, __vec4_i32 val,
-                                                __vec4_i1 mask) {
-    // FIXME: do a load, blendvps, store here...
-    __masked_store_32(p, val, mask);
-}
-
-static FORCEINLINE void __masked_store_blend_64(void *p, __vec4_i64 val,
-                                                __vec4_i1 mask) {
-    // FIXME: do a 2x (load, blendvps, store) here...
-    __masked_store_64(p, val, mask);
-}
+static FORCEINLINE void __masked_store_blend_i8(void *p, __vec4_i8 val,
+                                                __vec4_i1 mask) {
+    __masked_store_i8(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_i16(void *p, __vec4_i16 val,
+                                                 __vec4_i1 mask) {
+    __masked_store_i16(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_i32(void *p, __vec4_i32 val,
+                                                 __vec4_i1 mask) {
+    // FIXME: do a load, blendvps, store here...
+    __masked_store_i32(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_i64(void *p, __vec4_i64 val,
+                                                 __vec4_i1 mask) {
+    // FIXME: do a 2x (load, blendvps, store) here...
+    __masked_store_i64(p, val, mask);
+}

 ///////////////////////////////////////////////////////////////////////////
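
On the FIXMEs in the blend variants: a rough sketch of the
load/blendvps/store sequence they suggest, assuming each __vec4_i1 lane is
all-zeros or all-ones so its sign bit can drive the blend (the function name
here is illustrative, not part of the commit):

#include <smmintrin.h>   /* SSE4.1: _mm_blendv_ps */

/* Load the current contents, blend in the active lanes, store everything
 * back. Unlike the per-lane extract/store path, this always reads and
 * writes the full 16 bytes, so it is only valid when that whole span is
 * known to be accessible. */
static void masked_store_blend_i32_sketch(void *p, __m128i val, __m128 mask) {
    __m128 old = _mm_loadu_ps((float *)p);
    __m128 merged = _mm_blendv_ps(old, _mm_castsi128_ps(val), mask);
    _mm_storeu_ps((float *)p, merged);
}

The i64 variant would run this twice, once per __m128 half, matching the
"2x (load, blendvps, store)" note.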