Add separate variants of memory built-ins for floats and doubles.

Previously, we'd bitcast e.g. a vector of floats to a vector of i32s and then
use the i32 variant of masked_load/masked_store/gather/scatter.  Now, we have
separate float/double variants of each of those.
This commit is contained in:
Matt Pharr
2012-06-07 14:29:17 -07:00
parent 1ac3e03171
commit 89a2566e01
17 changed files with 593 additions and 41 deletions

View File

@@ -1131,6 +1131,16 @@ static FORCEINLINE __vec16_i32 __masked_load_i32(void *p,
return ret;
}
static FORCEINLINE __vec16_f __masked_load_float(void *p,
__vec16_i1 mask) {
__vec16_f ret;
float *ptr = (float *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ret.v[i] = ptr[i];
return ret;
}
static FORCEINLINE __vec16_i64 __masked_load_i64(void *p,
__vec16_i1 mask) {
__vec16_i64 ret;
@@ -1141,6 +1151,16 @@ static FORCEINLINE __vec16_i64 __masked_load_i64(void *p,
return ret;
}
static FORCEINLINE __vec16_d __masked_load_double(void *p,
__vec16_i1 mask) {
__vec16_d ret;
double *ptr = (double *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ret.v[i] = ptr[i];
return ret;
}
static FORCEINLINE void __masked_store_i8(void *p, __vec16_i8 val,
__vec16_i1 mask) {
int8_t *ptr = (int8_t *)p;
@@ -1165,6 +1185,14 @@ static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val,
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_float(void *p, __vec16_f val,
__vec16_i1 mask) {
float *ptr = (float *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val,
__vec16_i1 mask) {
int64_t *ptr = (int64_t *)p;
@@ -1173,6 +1201,12 @@ static FORCEINLINE void __masked_store_i64(void *p, __vec16_i64 val,
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_double(void *p, __vec16_d val,
__vec16_i1 mask) {
double *ptr = (double *)p;
for (int i = 0; i < 16; ++i)
if ((mask.v & (1 << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_blend_i8(void *p, __vec16_i8 val,
@@ -1190,11 +1224,19 @@ static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val,
__masked_store_i32(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
__vec16_i1 mask) {
__masked_store_float(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_i64(void *p, __vec16_i64 val,
__vec16_i1 mask) {
__masked_store_i64(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val,
__vec16_i1 mask) {
__masked_store_double(p, val, mask);
}
///////////////////////////////////////////////////////////////////////////
@@ -1224,8 +1266,12 @@ GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __gather_base_offsets32_float)
GATHER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __gather_base_offsets64_float)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __gather_base_offsets32_double)
GATHER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __gather_base_offsets64_double)
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec16_i1 mask) { \
@@ -1244,8 +1290,12 @@ GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __gather32_i16)
GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32)
GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32)
GATHER_GENERAL(__vec16_f, float, __vec16_i32, __gather32_float)
GATHER_GENERAL(__vec16_f, float, __vec16_i64, __gather64_float)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64)
GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
GATHER_GENERAL(__vec16_d, double, __vec16_i32, __gather32_double)
GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double)
// scatter
@@ -1269,8 +1319,12 @@ SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i32, __scatter_base_offsets32_float)
SCATTER_BASE_OFFSETS(__vec16_f, float, __vec16_i64, __scatter_base_offsets64_float)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i32, __scatter_base_offsets32_double)
SCATTER_BASE_OFFSETS(__vec16_d, double, __vec16_i64, __scatter_base_offsets64_double)
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \
@@ -1288,8 +1342,12 @@ SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16)
SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16)
SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __scatter32_i32)
SCATTER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __scatter64_i32)
SCATTER_GENERAL(__vec16_f, float, __vec16_i32, __scatter32_float)
SCATTER_GENERAL(__vec16_f, float, __vec16_i64, __scatter64_float)
SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __scatter32_i64)
SCATTER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __scatter64_i64)
SCATTER_GENERAL(__vec16_d, double, __vec16_i32, __scatter32_double)
SCATTER_GENERAL(__vec16_d, double, __vec16_i64, __scatter64_double)
///////////////////////////////////////////////////////////////////////////
// packed load/store