Changed the C++ API to use templates to indicate memory alignment to the C++ compiler

This should help with performance of the generated code.
Updated the relevant header files (sse4.h, generic-16.h, generic-32.h, generic-64.h)

Updated generic-32.h and generic-64.h to use the new memory API.
This commit is contained in:
Jean-Luc Duprat
2012-06-28 09:29:15 -07:00
committed by Matt Pharr
parent d34a87404d
commit e431b07e04
5 changed files with 188 additions and 59 deletions

View File

@@ -211,14 +211,16 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
}
#define LOAD_STORE(VTYPE, STYPE) \
static FORCEINLINE VTYPE __load(VTYPE *p, int align) { \
template <int ALIGN> \
static FORCEINLINE VTYPE __load(VTYPE *p) { \
STYPE *ptr = (STYPE *)p; \
VTYPE ret; \
for (int i = 0; i < 16; ++i) \
ret.v[i] = ptr[i]; \
return ret; \
} \
static FORCEINLINE void __store(VTYPE *p, VTYPE v, int align) { \
template <int ALIGN> \
static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \
STYPE *ptr = (STYPE *)p; \
for (int i = 0; i < 16; ++i) \
ptr[i] = v.v[i]; \
@@ -380,14 +382,14 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
vec->v |= (1 << index);
}
/* Load a __vec16_i1 from memory.
 *
 * Reinterprets p as a pointer to uint16_t, reads one 16-bit value through
 * it, and returns a __vec16_i1 whose v field holds that value.
 *
 * Two signature lines appear below: one taking a runtime `int align`
 * argument and one taking a compile-time `ALIGN` template parameter.
 * The body shown does not read either of them.
 * NOTE(review): presumably v carries one bit per lane -- confirm against
 * the __vec16_i1 definition, which is not visible here.
 */
static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p, int align) {
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p) {
uint16_t *ptr = (uint16_t *)p;
__vec16_i1 r;
r.v = *ptr;
return r;
}
/* Store a __vec16_i1 to memory.
 *
 * Reinterprets p as a pointer to uint16_t and writes v.v through it as a
 * single 16-bit value.
 *
 * Two signature lines appear below: one taking a runtime `int align`
 * argument and one taking a compile-time `ALIGN` template parameter.
 * The body shown does not read either of them.
 */
static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v, int align) {
template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) {
uint16_t *ptr = (uint16_t *)p;
*ptr = v.v;
}