Changed the C++ API to use templates to indicate memory alignment to the compiler

This should help the performance of the generated code.
Updated the relevant header files (sse4.h, generic-16.h, generic-32.h, generic-64.h).

Updated generic-32.h and generic-64.h to the new memory API (templated __load/__store, renamed masked load/store functions, and new float/double masked load/store, gather, and scatter variants).
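For illustration only (the call sites below are hypothetical, not taken from the commit): the alignment moves from a runtime function argument to a compile-time template parameter, so each instantiation sees it as a constant the optimizer can act on.

    // Given a __vec64_i1 *p known to be 16-byte aligned:

    // Old API: alignment is an ordinary runtime argument.
    __vec64_i1 m_old = __load(p, 16);
    __store(p, m_old, 16);

    // New API: alignment is a template parameter, fixed at compile time.
    __vec64_i1 m_new = __load<16>(p);
    __store<16>(p, m_new);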
Author: Jean-Luc Duprat
Date:   2012-06-27 16:59:26 -07:00
parent b6a8d0ee7f
commit f1b3703389
5 changed files with 184 additions and 57 deletions

@@ -570,14 +570,14 @@ static FORCEINLINE void __insert_element(__vec64_i1 *vec, int index,
vec->v |= (1ull << index);
}
static FORCEINLINE __vec64_i1 __load(__vec64_i1 *p, int align) {
template <int ALIGN> static FORCEINLINE __vec64_i1 __load(__vec64_i1 *p) {
uint16_t *ptr = (uint16_t *)p;
__vec64_i1 r;
r.v = *ptr;
return r;
}
static FORCEINLINE void __store(__vec64_i1 *p, __vec64_i1 v, int align) {
template <int ALIGN> static FORCEINLINE void __store(__vec64_i1 *p, __vec64_i1 v) {
uint16_t *ptr = (uint16_t *)p;
*ptr = v.v;
}
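In this generic reference implementation the ALIGN parameter is accepted but not used in the body; the point of making it a template parameter is that a target-specific header can exploit the compile-time constant. A minimal sketch of what that could look like, assuming GCC/Clang's __builtin_assume_aligned is available (an illustration, not code from this commit):

    template <int ALIGN> static FORCEINLINE __vec64_i1 __load(__vec64_i1 *p) {
        // Promise the optimizer that p is ALIGN-byte aligned so it may emit
        // aligned moves for the underlying access.
        uint16_t *ptr = (uint16_t *)__builtin_assume_aligned(p, ALIGN);
        __vec64_i1 r;
        r.v = *ptr;
        return r;
    }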
@@ -1297,7 +1297,7 @@ REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_max_uint64, >)
///////////////////////////////////////////////////////////////////////////
// masked load/store
static FORCEINLINE __vec64_i8 __masked_load_8(void *p,
static FORCEINLINE __vec64_i8 __masked_load_i8(void *p,
__vec64_i1 mask) {
__vec64_i8 ret;
int8_t *ptr = (int8_t *)p;
@@ -1307,7 +1307,7 @@ static FORCEINLINE __vec64_i8 __masked_load_8(void *p,
return ret;
}
static FORCEINLINE __vec64_i16 __masked_load_16(void *p,
static FORCEINLINE __vec64_i16 __masked_load_i16(void *p,
__vec64_i1 mask) {
__vec64_i16 ret;
int16_t *ptr = (int16_t *)p;
@@ -1317,7 +1317,7 @@ static FORCEINLINE __vec64_i16 __masked_load_16(void *p,
return ret;
}
static FORCEINLINE __vec64_i32 __masked_load_32(void *p,
static FORCEINLINE __vec64_i32 __masked_load_i32(void *p,
__vec64_i1 mask) {
__vec64_i32 ret;
int32_t *ptr = (int32_t *)p;
@@ -1327,7 +1327,7 @@ static FORCEINLINE __vec64_i32 __masked_load_32(void *p,
return ret;
}
static FORCEINLINE __vec64_i64 __masked_load_64(void *p,
static FORCEINLINE __vec64_i64 __masked_load_i64(void *p,
__vec64_i1 mask) {
__vec64_i64 ret;
int64_t *ptr = (int64_t *)p;
@@ -1337,7 +1337,27 @@ static FORCEINLINE __vec64_i64 __masked_load_64(void *p,
return ret;
}
static FORCEINLINE void __masked_store_8(void *p, __vec64_i8 val,
static FORCEINLINE __vec64_f __masked_load_float(void *p,
__vec64_i1 mask) {
__vec64_f ret;
float *ptr = (float *)p;
for (int i = 0; i < 64; ++i)
if ((mask.v & (1ull << i)) != 0)
ret.v[i] = ptr[i];
return ret;
}
static FORCEINLINE __vec64_d __masked_load_double(void *p,
__vec64_i1 mask) {
__vec64_d ret;
double *ptr = (double *)p;
for (int i = 0; i < 64; ++i)
if ((mask.v & (1ull << i)) != 0)
ret.v[i] = ptr[i];
return ret;
}
static FORCEINLINE void __masked_store_i8(void *p, __vec64_i8 val,
__vec64_i1 mask) {
int8_t *ptr = (int8_t *)p;
for (int i = 0; i < 64; ++i)
@@ -1345,7 +1365,7 @@ static FORCEINLINE void __masked_store_8(void *p, __vec64_i8 val,
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_16(void *p, __vec64_i16 val,
static FORCEINLINE void __masked_store_i16(void *p, __vec64_i16 val,
__vec64_i1 mask) {
int16_t *ptr = (int16_t *)p;
for (int i = 0; i < 64; ++i)
@@ -1353,7 +1373,7 @@ static FORCEINLINE void __masked_store_16(void *p, __vec64_i16 val,
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_32(void *p, __vec64_i32 val,
static FORCEINLINE void __masked_store_i32(void *p, __vec64_i32 val,
__vec64_i1 mask) {
int32_t *ptr = (int32_t *)p;
for (int i = 0; i < 64; ++i)
@@ -1361,7 +1381,7 @@ static FORCEINLINE void __masked_store_32(void *p, __vec64_i32 val,
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_64(void *p, __vec64_i64 val,
static FORCEINLINE void __masked_store_i64(void *p, __vec64_i64 val,
__vec64_i1 mask) {
int64_t *ptr = (int64_t *)p;
for (int i = 0; i < 64; ++i)
@@ -1369,24 +1389,50 @@ static FORCEINLINE void __masked_store_64(void *p, __vec64_i64 val,
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_blend_8(void *p, __vec64_i8 val,
static FORCEINLINE void __masked_store_float(void *p, __vec64_f val,
__vec64_i1 mask) {
float *ptr = (float *)p;
for (int i = 0; i < 64; ++i)
if ((mask.v & (1ull << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_double(void *p, __vec64_d val,
__vec64_i1 mask) {
double *ptr = (double *)p;
for (int i = 0; i < 64; ++i)
if ((mask.v & (1ull << i)) != 0)
ptr[i] = val.v[i];
}
static FORCEINLINE void __masked_store_blend_i8(void *p, __vec64_i8 val,
__vec64_i1 mask) {
__masked_store_8(p, val, mask);
__masked_store_i8(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_16(void *p, __vec64_i16 val,
__vec64_i1 mask) {
__masked_store_16(p, val, mask);
__masked_store_i16(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_32(void *p, __vec64_i32 val,
__vec64_i1 mask) {
__masked_store_32(p, val, mask);
__masked_store_i32(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_64(void *p, __vec64_i64 val,
__vec64_i1 mask) {
__masked_store_64(p, val, mask);
__masked_store_i64(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_float(void *p, __vec64_f val,
__vec64_i1 mask) {
__masked_store_float(p, val, mask);
}
static FORCEINLINE void __masked_store_blend_double(void *p, __vec64_d val,
__vec64_i1 mask) {
__masked_store_double(p, val, mask);
}
///////////////////////////////////////////////////////////////////////////
@@ -1416,8 +1462,12 @@ GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_base_offsets32_i
GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_base_offsets64_i16)
GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_base_offsets32_i32)
GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_base_offsets32_float)
GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_base_offsets64_float)
GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_base_offsets64_i64)
GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_base_offsets32_double)
GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_double)
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \
@@ -1436,8 +1486,12 @@ GATHER_GENERAL(__vec64_i16, int16_t, __vec64_i32, __gather32_i16)
GATHER_GENERAL(__vec64_i16, int16_t, __vec64_i64, __gather64_i16)
GATHER_GENERAL(__vec64_i32, int32_t, __vec64_i32, __gather32_i32)
GATHER_GENERAL(__vec64_i32, int32_t, __vec64_i64, __gather64_i32)
GATHER_GENERAL(__vec64_f, float, __vec64_i32, __gather32_float)
GATHER_GENERAL(__vec64_f, float, __vec64_i64, __gather64_float)
GATHER_GENERAL(__vec64_i64, int64_t, __vec64_i32, __gather32_i64)
GATHER_GENERAL(__vec64_i64, int64_t, __vec64_i64, __gather64_i64)
GATHER_GENERAL(__vec64_d, double, __vec64_i32, __gather32_double)
GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double)
// scatter
@@ -1461,8 +1515,12 @@ SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_base_offsets32
SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_base_offsets64_i16)
SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_base_offsets32_i32)
SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_base_offsets32_float)
SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_base_offsets64_float)
SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_base_offsets64_i64)
SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_base_offsets32_double)
SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_base_offsets64_double)
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec64_i1 mask) { \
@@ -1480,8 +1538,12 @@ SCATTER_GENERAL(__vec64_i16, int16_t, __vec64_i32, __scatter32_i16)
SCATTER_GENERAL(__vec64_i16, int16_t, __vec64_i64, __scatter64_i16)
SCATTER_GENERAL(__vec64_i32, int32_t, __vec64_i32, __scatter32_i32)
SCATTER_GENERAL(__vec64_i32, int32_t, __vec64_i64, __scatter64_i32)
SCATTER_GENERAL(__vec64_f, float, __vec64_i32, __scatter32_float)
SCATTER_GENERAL(__vec64_f, float, __vec64_i64, __scatter64_float)
SCATTER_GENERAL(__vec64_i64, int64_t, __vec64_i32, __scatter32_i64)
SCATTER_GENERAL(__vec64_i64, int64_t, __vec64_i64, __scatter64_i64)
SCATTER_GENERAL(__vec64_d, double, __vec64_i32, __scatter32_double)
SCATTER_GENERAL(__vec64_d, double, __vec64_i64, __scatter64_double)
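The GATHER_GENERAL and SCATTER_GENERAL macro bodies are cut off in this view; judging from the masked load/store loops above, the new float instantiations presumably expand to per-lane loops along these lines (a reconstruction under that assumption, not the file's exact text):

    // Each active lane dereferences its own pointer from the pointer vector.
    static FORCEINLINE __vec64_f __gather64_float(__vec64_i64 ptrs, __vec64_i1 mask) {
        __vec64_f ret;
        for (int i = 0; i < 64; ++i)
            if ((mask.v & (1ull << i)) != 0)
                ret.v[i] = *(float *)(uintptr_t)ptrs.v[i];
        return ret;
    }

    // The matching scatter writes the active lanes back through those pointers.
    static FORCEINLINE void __scatter64_float(__vec64_i64 ptrs, __vec64_f val, __vec64_i1 mask) {
        for (int i = 0; i < 64; ++i)
            if ((mask.v & (1ull << i)) != 0)
                *(float *)(uintptr_t)ptrs.v[i] = val.v[i];
    }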
///////////////////////////////////////////////////////////////////////////
// packed load/store