Changed the C API to use templates to indicate memory alignment to the C++ compiler

This should help the performance of the generated code. Updated the relevant header files (sse4.h, generic-16.h, generic-32.h, generic-64.h), and brought generic-32.h and generic-64.h up to date with the new memory API: the masked load/store helpers are now suffixed by element type (e.g. __masked_load_i8 instead of __masked_load_8), and float/double variants of the masked load/store, gather, and scatter helpers were added.
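Why templates instead of a runtime argument: with the old signatures (__load(p, align), __store(p, v, align)) the alignment reached the target headers as an ordinary int parameter, so an implementation could not use it to pick a different instruction sequence at compile time. With the alignment as a template parameter, a target header can branch on it and have the branch folded away during instantiation. A minimal sketch of what that enables, written against the __vec4_f type from sse4.h; this is illustrative only and is not part of the commit (the committed sse4.h still takes the unaligned path and keeps FIXME notes about handling 16-byte alignment):

template <int ALIGN> static FORCEINLINE __vec4_f __load(__vec4_f *v) {
    if (ALIGN >= 16)
        // ALIGN is a compile-time constant, so this branch disappears
        // during instantiation and the aligned SSE load is emitted.
        return _mm_load_ps((float *)(&v->v));
    // Fall back to the unaligned load when the alignment is small or unknown.
    return _mm_loadu_ps((float *)(&v->v));
}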
@@ -3950,9 +3950,8 @@ void CWriter::writeMemoryAccess(llvm::Value *Operand, llvm::Type *OperandType,
 void CWriter::visitLoadInst(llvm::LoadInst &I) {
     llvm::VectorType *VT = llvm::dyn_cast<llvm::VectorType>(I.getType());
     if (VT != NULL) {
-        Out << "__load(";
+        Out << "__load<" << I.getAlignment() << ">(";
         writeOperand(I.getOperand(0));
-        Out << ", " << I.getAlignment();
         Out << ")";
         return;
     }
@@ -3964,11 +3963,11 @@ void CWriter::visitLoadInst(llvm::LoadInst &I) {
 void CWriter::visitStoreInst(llvm::StoreInst &I) {
     llvm::VectorType *VT = llvm::dyn_cast<llvm::VectorType>(I.getOperand(0)->getType());
     if (VT != NULL) {
-        Out << "__store(";
+        Out << "__store<" << I.getAlignment() << ">(";
         writeOperand(I.getOperand(1));
         Out << ", ";
         writeOperand(I.getOperand(0));
-        Out << ", " << I.getAlignment() << ")";
+        Out << ")";
         return;
     }

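The visible effect of the two hunks above is in the calls that the C writer emits for vector loads and stores. For example (illustrative, not captured compiler output; the pointer and value names are made up), a 16-byte-aligned access that used to be written as

    __vec4_f v = __load(ptr, 16);
    __store(ptr, v, 16);

is now emitted as

    __vec4_f v = __load<16>(ptr);
    __store<16>(ptr, v);

so the alignment recorded on the LLVM load/store instruction reaches the target header as a compile-time constant.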

generic-16.h
@@ -211,14 +211,16 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
 }

 #define LOAD_STORE(VTYPE, STYPE) \
-static FORCEINLINE VTYPE __load(VTYPE *p, int align) { \
+template <int ALIGN> \
+static FORCEINLINE VTYPE __load(VTYPE *p) { \
     STYPE *ptr = (STYPE *)p; \
     VTYPE ret; \
     for (int i = 0; i < 16; ++i) \
         ret.v[i] = ptr[i]; \
     return ret; \
 } \
-static FORCEINLINE void __store(VTYPE *p, VTYPE v, int align) { \
+template <int ALIGN> \
+static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \
     STYPE *ptr = (STYPE *)p; \
     for (int i = 0; i < 16; ++i) \
         ptr[i] = v.v[i]; \
@@ -380,14 +382,14 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
     vec->v |= (1 << index);
 }

-static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p, int align) {
+template <int ALIGN> static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p) {
     uint16_t *ptr = (uint16_t *)p;
     __vec16_i1 r;
     r.v = *ptr;
     return r;
 }

-static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v, int align) {
+template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) {
     uint16_t *ptr = (uint16_t *)p;
     *ptr = v.v;
 }

generic-32.h
@@ -276,14 +276,16 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
 }

 #define LOAD_STORE(VTYPE, STYPE) \
-static FORCEINLINE VTYPE __load(VTYPE *p, int align) { \
+template <int ALIGN> \
+static FORCEINLINE VTYPE __load(VTYPE *p) { \
     STYPE *ptr = (STYPE *)p; \
     VTYPE ret; \
     for (int i = 0; i < 32; ++i) \
         ret.v[i] = ptr[i]; \
     return ret; \
 } \
-static FORCEINLINE void __store(VTYPE *p, VTYPE v, int align) { \
+template <int ALIGN> \
+static FORCEINLINE void __store(VTYPE *p, VTYPE v) { \
     STYPE *ptr = (STYPE *)p; \
     for (int i = 0; i < 32; ++i) \
         ptr[i] = v.v[i]; \
@@ -445,14 +447,14 @@ static FORCEINLINE void __insert_element(__vec32_i1 *vec, int index,
     vec->v |= (1 << index);
 }

-static FORCEINLINE __vec32_i1 __load(__vec32_i1 *p, int align) {
+template <int ALIGN> static FORCEINLINE __vec32_i1 __load(__vec32_i1 *p) {
     uint16_t *ptr = (uint16_t *)p;
     __vec32_i1 r;
     r.v = *ptr;
     return r;
 }

-static FORCEINLINE void __store(__vec32_i1 *p, __vec32_i1 v, int align) {
+template <int ALIGN> static FORCEINLINE void __store(__vec32_i1 *p, __vec32_i1 v) {
     uint16_t *ptr = (uint16_t *)p;
     *ptr = v.v;
 }
@@ -1168,7 +1170,7 @@ REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_max_uint64, >)
 ///////////////////////////////////////////////////////////////////////////
 // masked load/store

-static FORCEINLINE __vec32_i8 __masked_load_8(void *p,
+static FORCEINLINE __vec32_i8 __masked_load_i8(void *p,
                                               __vec32_i1 mask) {
     __vec32_i8 ret;
     int8_t *ptr = (int8_t *)p;
@@ -1178,7 +1180,7 @@ static FORCEINLINE __vec32_i8 __masked_load_8(void *p,
     return ret;
 }

-static FORCEINLINE __vec32_i16 __masked_load_16(void *p,
+static FORCEINLINE __vec32_i16 __masked_load_i16(void *p,
                                                 __vec32_i1 mask) {
     __vec32_i16 ret;
     int16_t *ptr = (int16_t *)p;
@@ -1188,7 +1190,7 @@ static FORCEINLINE __vec32_i16 __masked_load_16(void *p,
     return ret;
 }

-static FORCEINLINE __vec32_i32 __masked_load_32(void *p,
+static FORCEINLINE __vec32_i32 __masked_load_i32(void *p,
                                                 __vec32_i1 mask) {
     __vec32_i32 ret;
     int32_t *ptr = (int32_t *)p;
@@ -1198,7 +1200,7 @@ static FORCEINLINE __vec32_i32 __masked_load_32(void *p,
     return ret;
 }

-static FORCEINLINE __vec32_i64 __masked_load_64(void *p,
+static FORCEINLINE __vec32_i64 __masked_load_i64(void *p,
                                                 __vec32_i1 mask) {
     __vec32_i64 ret;
     int64_t *ptr = (int64_t *)p;
@@ -1208,7 +1210,27 @@ static FORCEINLINE __vec32_i64 __masked_load_64(void *p,
     return ret;
 }

-static FORCEINLINE void __masked_store_8(void *p, __vec32_i8 val,
+static FORCEINLINE __vec32_f __masked_load_float(void *p,
+                                                 __vec32_i1 mask) {
+    __vec32_f ret;
+    float *ptr = (float *)p;
+    for (int i = 0; i < 32; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ret.v[i] = ptr[i];
+    return ret;
+}
+
+static FORCEINLINE __vec32_d __masked_load_double(void *p,
+                                                  __vec32_i1 mask) {
+    __vec32_d ret;
+    double *ptr = (double *)p;
+    for (int i = 0; i < 32; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ret.v[i] = ptr[i];
+    return ret;
+}
+
+static FORCEINLINE void __masked_store_i8(void *p, __vec32_i8 val,
                                           __vec32_i1 mask) {
     int8_t *ptr = (int8_t *)p;
     for (int i = 0; i < 32; ++i)
@@ -1216,7 +1238,7 @@ static FORCEINLINE void __masked_store_8(void *p, __vec32_i8 val,
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_16(void *p, __vec32_i16 val,
+static FORCEINLINE void __masked_store_i16(void *p, __vec32_i16 val,
                                           __vec32_i1 mask) {
     int16_t *ptr = (int16_t *)p;
     for (int i = 0; i < 32; ++i)
@@ -1224,7 +1246,7 @@ static FORCEINLINE void __masked_store_16(void *p, __vec32_i16 val,
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_32(void *p, __vec32_i32 val,
+static FORCEINLINE void __masked_store_i32(void *p, __vec32_i32 val,
                                           __vec32_i1 mask) {
     int32_t *ptr = (int32_t *)p;
     for (int i = 0; i < 32; ++i)
@@ -1232,7 +1254,7 @@ static FORCEINLINE void __masked_store_32(void *p, __vec32_i32 val,
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_64(void *p, __vec32_i64 val,
+static FORCEINLINE void __masked_store_i64(void *p, __vec32_i64 val,
                                           __vec32_i1 mask) {
     int64_t *ptr = (int64_t *)p;
     for (int i = 0; i < 32; ++i)
@@ -1240,24 +1262,50 @@ static FORCEINLINE void __masked_store_64(void *p, __vec32_i64 val,
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_blend_8(void *p, __vec32_i8 val,
+static FORCEINLINE void __masked_store_float(void *p, __vec32_f val,
+                                             __vec32_i1 mask) {
+    float *ptr = (float *)p;
+    for (int i = 0; i < 32; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ptr[i] = val.v[i];
+}
+
+static FORCEINLINE void __masked_store_double(void *p, __vec32_d val,
+                                              __vec32_i1 mask) {
+    double *ptr = (double *)p;
+    for (int i = 0; i < 32; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ptr[i] = val.v[i];
+}
+
+static FORCEINLINE void __masked_store_blend_i8(void *p, __vec32_i8 val,
                                                 __vec32_i1 mask) {
-    __masked_store_8(p, val, mask);
+    __masked_store_i8(p, val, mask);
 }

-static FORCEINLINE void __masked_store_blend_16(void *p, __vec32_i16 val,
+static FORCEINLINE void __masked_store_blend_i16(void *p, __vec32_i16 val,
                                                 __vec32_i1 mask) {
-    __masked_store_16(p, val, mask);
+    __masked_store_i16(p, val, mask);
 }

-static FORCEINLINE void __masked_store_blend_32(void *p, __vec32_i32 val,
+static FORCEINLINE void __masked_store_blend_i32(void *p, __vec32_i32 val,
                                                 __vec32_i1 mask) {
-    __masked_store_32(p, val, mask);
+    __masked_store_i32(p, val, mask);
 }

-static FORCEINLINE void __masked_store_blend_64(void *p, __vec32_i64 val,
+static FORCEINLINE void __masked_store_blend_i64(void *p, __vec32_i64 val,
                                                 __vec32_i1 mask) {
-    __masked_store_64(p, val, mask);
+    __masked_store_i64(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_float(void *p, __vec32_f val,
+                                                   __vec32_i1 mask) {
+    __masked_store_float(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val,
+                                                    __vec32_i1 mask) {
+    __masked_store_double(p, val, mask);
 }

 ///////////////////////////////////////////////////////////////////////////
@@ -1287,8 +1335,12 @@ GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __gather_base_offsets32_i
 GATHER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __gather_base_offsets64_i16)
 GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __gather_base_offsets32_i32)
 GATHER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __gather_base_offsets64_i32)
+GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __gather_base_offsets32_float)
+GATHER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __gather_base_offsets64_float)
 GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __gather_base_offsets32_i64)
 GATHER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __gather_base_offsets64_i64)
+GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __gather_base_offsets32_double)
+GATHER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __gather_base_offsets64_double)

 #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
 static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec32_i1 mask) { \
@@ -1307,8 +1359,12 @@ GATHER_GENERAL(__vec32_i16, int16_t, __vec32_i32, __gather32_i16)
 GATHER_GENERAL(__vec32_i16, int16_t, __vec32_i64, __gather64_i16)
 GATHER_GENERAL(__vec32_i32, int32_t, __vec32_i32, __gather32_i32)
 GATHER_GENERAL(__vec32_i32, int32_t, __vec32_i64, __gather64_i32)
+GATHER_GENERAL(__vec32_f, float, __vec32_i32, __gather32_float)
+GATHER_GENERAL(__vec32_f, float, __vec32_i64, __gather64_float)
 GATHER_GENERAL(__vec32_i64, int64_t, __vec32_i32, __gather32_i64)
 GATHER_GENERAL(__vec32_i64, int64_t, __vec32_i64, __gather64_i64)
+GATHER_GENERAL(__vec32_d, double, __vec32_i32, __gather32_double)
+GATHER_GENERAL(__vec32_d, double, __vec32_i64, __gather64_double)

 // scatter

@@ -1332,8 +1388,12 @@ SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i32, __scatter_base_offsets32
 SCATTER_BASE_OFFSETS(__vec32_i16, int16_t, __vec32_i64, __scatter_base_offsets64_i16)
 SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i32, __scatter_base_offsets32_i32)
 SCATTER_BASE_OFFSETS(__vec32_i32, int32_t, __vec32_i64, __scatter_base_offsets64_i32)
+SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i32, __scatter_base_offsets32_float)
+SCATTER_BASE_OFFSETS(__vec32_f, float, __vec32_i64, __scatter_base_offsets64_float)
 SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i32, __scatter_base_offsets32_i64)
 SCATTER_BASE_OFFSETS(__vec32_i64, int64_t, __vec32_i64, __scatter_base_offsets64_i64)
+SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i32, __scatter_base_offsets32_double)
+SCATTER_BASE_OFFSETS(__vec32_d, double, __vec32_i64, __scatter_base_offsets64_double)

 #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
 static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec32_i1 mask) { \
@@ -1351,8 +1411,12 @@ SCATTER_GENERAL(__vec32_i16, int16_t, __vec32_i32, __scatter32_i16)
 SCATTER_GENERAL(__vec32_i16, int16_t, __vec32_i64, __scatter64_i16)
 SCATTER_GENERAL(__vec32_i32, int32_t, __vec32_i32, __scatter32_i32)
 SCATTER_GENERAL(__vec32_i32, int32_t, __vec32_i64, __scatter64_i32)
+SCATTER_GENERAL(__vec32_f, float, __vec32_i32, __scatter32_float)
+SCATTER_GENERAL(__vec32_f, float, __vec32_i64, __scatter64_float)
 SCATTER_GENERAL(__vec32_i64, int64_t, __vec32_i32, __scatter32_i64)
 SCATTER_GENERAL(__vec32_i64, int64_t, __vec32_i64, __scatter64_i64)
+SCATTER_GENERAL(__vec32_d, double, __vec32_i32, __scatter32_double)
+SCATTER_GENERAL(__vec32_d, double, __vec32_i64, __scatter64_double)

 ///////////////////////////////////////////////////////////////////////////
 // packed load/store

generic-64.h
@@ -570,14 +570,14 @@ static FORCEINLINE void __insert_element(__vec64_i1 *vec, int index,
     vec->v |= (1ull << index);
 }

-static FORCEINLINE __vec64_i1 __load(__vec64_i1 *p, int align) {
+template <int ALIGN> static FORCEINLINE __vec64_i1 __load(__vec64_i1 *p) {
     uint16_t *ptr = (uint16_t *)p;
     __vec64_i1 r;
     r.v = *ptr;
     return r;
 }

-static FORCEINLINE void __store(__vec64_i1 *p, __vec64_i1 v, int align) {
+template <int ALIGN> static FORCEINLINE void __store(__vec64_i1 *p, __vec64_i1 v) {
     uint16_t *ptr = (uint16_t *)p;
     *ptr = v.v;
 }
@@ -1297,7 +1297,7 @@ REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_max_uint64, >)
 ///////////////////////////////////////////////////////////////////////////
 // masked load/store

-static FORCEINLINE __vec64_i8 __masked_load_8(void *p,
+static FORCEINLINE __vec64_i8 __masked_load_i8(void *p,
                                               __vec64_i1 mask) {
     __vec64_i8 ret;
     int8_t *ptr = (int8_t *)p;
@@ -1307,7 +1307,7 @@ static FORCEINLINE __vec64_i8 __masked_load_8(void *p,
     return ret;
 }

-static FORCEINLINE __vec64_i16 __masked_load_16(void *p,
+static FORCEINLINE __vec64_i16 __masked_load_i16(void *p,
                                                 __vec64_i1 mask) {
     __vec64_i16 ret;
     int16_t *ptr = (int16_t *)p;
@@ -1317,7 +1317,7 @@ static FORCEINLINE __vec64_i16 __masked_load_16(void *p,
     return ret;
 }

-static FORCEINLINE __vec64_i32 __masked_load_32(void *p,
+static FORCEINLINE __vec64_i32 __masked_load_i32(void *p,
                                                 __vec64_i1 mask) {
     __vec64_i32 ret;
     int32_t *ptr = (int32_t *)p;
@@ -1327,7 +1327,7 @@ static FORCEINLINE __vec64_i32 __masked_load_32(void *p,
     return ret;
 }

-static FORCEINLINE __vec64_i64 __masked_load_64(void *p,
+static FORCEINLINE __vec64_i64 __masked_load_i64(void *p,
                                                 __vec64_i1 mask) {
     __vec64_i64 ret;
     int64_t *ptr = (int64_t *)p;
@@ -1337,7 +1337,27 @@ static FORCEINLINE __vec64_i64 __masked_load_64(void *p,
     return ret;
 }

-static FORCEINLINE void __masked_store_8(void *p, __vec64_i8 val,
+static FORCEINLINE __vec64_f __masked_load_float(void *p,
+                                                 __vec64_i1 mask) {
+    __vec64_f ret;
+    float *ptr = (float *)p;
+    for (int i = 0; i < 64; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ret.v[i] = ptr[i];
+    return ret;
+}
+
+static FORCEINLINE __vec64_d __masked_load_double(void *p,
+                                                  __vec64_i1 mask) {
+    __vec64_d ret;
+    double *ptr = (double *)p;
+    for (int i = 0; i < 64; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ret.v[i] = ptr[i];
+    return ret;
+}
+
+static FORCEINLINE void __masked_store_i8(void *p, __vec64_i8 val,
                                           __vec64_i1 mask) {
     int8_t *ptr = (int8_t *)p;
     for (int i = 0; i < 64; ++i)
@@ -1345,7 +1365,7 @@ static FORCEINLINE void __masked_store_8(void *p, __vec64_i8 val,
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_16(void *p, __vec64_i16 val,
+static FORCEINLINE void __masked_store_i16(void *p, __vec64_i16 val,
                                           __vec64_i1 mask) {
     int16_t *ptr = (int16_t *)p;
     for (int i = 0; i < 64; ++i)
@@ -1353,7 +1373,7 @@ static FORCEINLINE void __masked_store_16(void *p, __vec64_i16 val,
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_32(void *p, __vec64_i32 val,
+static FORCEINLINE void __masked_store_i32(void *p, __vec64_i32 val,
                                           __vec64_i1 mask) {
     int32_t *ptr = (int32_t *)p;
     for (int i = 0; i < 64; ++i)
@@ -1361,7 +1381,7 @@ static FORCEINLINE void __masked_store_32(void *p, __vec64_i32 val,
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_64(void *p, __vec64_i64 val,
+static FORCEINLINE void __masked_store_i64(void *p, __vec64_i64 val,
                                           __vec64_i1 mask) {
     int64_t *ptr = (int64_t *)p;
     for (int i = 0; i < 64; ++i)
@@ -1369,24 +1389,50 @@ static FORCEINLINE void __masked_store_64(void *p, __vec64_i64 val,
             ptr[i] = val.v[i];
 }

-static FORCEINLINE void __masked_store_blend_8(void *p, __vec64_i8 val,
+static FORCEINLINE void __masked_store_float(void *p, __vec64_f val,
+                                             __vec64_i1 mask) {
+    float *ptr = (float *)p;
+    for (int i = 0; i < 64; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ptr[i] = val.v[i];
+}
+
+static FORCEINLINE void __masked_store_double(void *p, __vec64_d val,
+                                              __vec64_i1 mask) {
+    double *ptr = (double *)p;
+    for (int i = 0; i < 64; ++i)
+        if ((mask.v & (1 << i)) != 0)
+            ptr[i] = val.v[i];
+}
+
+static FORCEINLINE void __masked_store_blend_i8(void *p, __vec64_i8 val,
                                                 __vec64_i1 mask) {
-    __masked_store_8(p, val, mask);
+    __masked_store_i8(p, val, mask);
 }

 static FORCEINLINE void __masked_store_blend_16(void *p, __vec64_i16 val,
                                                 __vec64_i1 mask) {
-    __masked_store_16(p, val, mask);
+    __masked_store_i16(p, val, mask);
 }

 static FORCEINLINE void __masked_store_blend_32(void *p, __vec64_i32 val,
                                                 __vec64_i1 mask) {
-    __masked_store_32(p, val, mask);
+    __masked_store_i32(p, val, mask);
 }

 static FORCEINLINE void __masked_store_blend_64(void *p, __vec64_i64 val,
                                                 __vec64_i1 mask) {
-    __masked_store_64(p, val, mask);
+    __masked_store_i64(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_float(void *p, __vec32_f val,
+                                                   __vec32_i1 mask) {
+    __masked_store_float(p, val, mask);
+}
+
+static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val,
+                                                    __vec32_i1 mask) {
+    __masked_store_double(p, val, mask);
 }

 ///////////////////////////////////////////////////////////////////////////
@@ -1416,8 +1462,12 @@ GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __gather_base_offsets32_i
 GATHER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __gather_base_offsets64_i16)
 GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __gather_base_offsets32_i32)
 GATHER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __gather_base_offsets64_i32)
+GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __gather_base_offsets32_float)
+GATHER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __gather_base_offsets64_float)
 GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __gather_base_offsets32_i64)
 GATHER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __gather_base_offsets64_i64)
+GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __gather_base_offsets32_double)
+GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_double)

 #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
 static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \
@@ -1436,8 +1486,12 @@ GATHER_GENERAL(__vec64_i16, int16_t, __vec64_i32, __gather32_i16)
 GATHER_GENERAL(__vec64_i16, int16_t, __vec64_i64, __gather64_i16)
 GATHER_GENERAL(__vec64_i32, int32_t, __vec64_i32, __gather32_i32)
 GATHER_GENERAL(__vec64_i32, int32_t, __vec64_i64, __gather64_i32)
+GATHER_GENERAL(__vec64_f, float, __vec64_i32, __gather32_float)
+GATHER_GENERAL(__vec64_f, float, __vec64_i64, __gather64_float)
 GATHER_GENERAL(__vec64_i64, int64_t, __vec64_i32, __gather32_i64)
 GATHER_GENERAL(__vec64_i64, int64_t, __vec64_i64, __gather64_i64)
+GATHER_GENERAL(__vec64_d, double, __vec64_i32, __gather32_double)
+GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double)

 // scatter

@@ -1461,8 +1515,12 @@ SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i32, __scatter_base_offsets32
 SCATTER_BASE_OFFSETS(__vec64_i16, int16_t, __vec64_i64, __scatter_base_offsets64_i16)
 SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i32, __scatter_base_offsets32_i32)
 SCATTER_BASE_OFFSETS(__vec64_i32, int32_t, __vec64_i64, __scatter_base_offsets64_i32)
+SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i32, __scatter_base_offsets32_float)
+SCATTER_BASE_OFFSETS(__vec64_f, float, __vec64_i64, __scatter_base_offsets64_float)
 SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i32, __scatter_base_offsets32_i64)
 SCATTER_BASE_OFFSETS(__vec64_i64, int64_t, __vec64_i64, __scatter_base_offsets64_i64)
+SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i32, __scatter_base_offsets32_double)
+SCATTER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __scatter_base_offsets64_double)

 #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) \
 static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec64_i1 mask) { \
@@ -1480,8 +1538,12 @@ SCATTER_GENERAL(__vec64_i16, int16_t, __vec64_i32, __scatter32_i16)
 SCATTER_GENERAL(__vec64_i16, int16_t, __vec64_i64, __scatter64_i16)
 SCATTER_GENERAL(__vec64_i32, int32_t, __vec64_i32, __scatter32_i32)
 SCATTER_GENERAL(__vec64_i32, int32_t, __vec64_i64, __scatter64_i32)
+SCATTER_GENERAL(__vec64_f, float, __vec64_i32, __scatter32_float)
+SCATTER_GENERAL(__vec64_f, float, __vec64_i64, __scatter64_float)
 SCATTER_GENERAL(__vec64_i64, int64_t, __vec64_i32, __scatter32_i64)
 SCATTER_GENERAL(__vec64_i64, int64_t, __vec64_i64, __scatter64_i64)
+SCATTER_GENERAL(__vec64_d, double, __vec64_i32, __scatter32_double)
+SCATTER_GENERAL(__vec64_d, double, __vec64_i64, __scatter64_double)

 ///////////////////////////////////////////////////////////////////////////
 // packed load/store

sse4.h
@@ -287,12 +287,12 @@ static FORCEINLINE void __insert_element(__vec4_i1 *v, int index, bool val) {
     ((int32_t *)v)[index] = val ? -1 : 0;
 }

-static FORCEINLINE __vec4_i1 __load(__vec4_i1 *v, int align) {
+template <int ALIGN> static FORCEINLINE __vec4_i1 __load(__vec4_i1 *v) {
     // FIXME: handle align of 16...
     return _mm_loadu_ps((float *)(&v->v));
 }

-static FORCEINLINE void __store(__vec4_i1 *p, __vec4_i1 value, int align) {
+template <int ALIGN> static FORCEINLINE void __store(__vec4_i1 *p, __vec4_i1 value) {
     // FIXME: handle align
     _mm_storeu_ps((float *)(&p->v), value.v);
 }
@@ -556,12 +556,12 @@ static FORCEINLINE __vec4_i8 __shuffle2_i8(__vec4_i8 v0, __vec4_i8 v1,
     return __vec4_i8(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE __vec4_i8 __load(__vec4_i8 *v, int align) {
+template <int ALIGN> static FORCEINLINE __vec4_i8 __load(__vec4_i8 *v) {
     uint8_t *ptr = (uint8_t *)(&v->v);
     return __vec4_i8(ptr[0], ptr[1], ptr[2], ptr[3]);
 }

-static FORCEINLINE void __store(__vec4_i8 *p, __vec4_i8 value, int align) {
+template <int ALIGN> static FORCEINLINE void __store(__vec4_i8 *p, __vec4_i8 value) {
     uint8_t *ptr = (uint8_t *)(&p->v);
     ptr[0] = _mm_extract_epi8(value.v, 0);
     ptr[1] = _mm_extract_epi8(value.v, 1);
@@ -815,12 +815,12 @@ static FORCEINLINE __vec4_i16 __shuffle2_i16(__vec4_i16 v0, __vec4_i16 v1,
     return __vec4_i16(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE __vec4_i16 __load(__vec4_i16 *v, int align) {
+template <int ALIGN> static FORCEINLINE __vec4_i16 __load(__vec4_i16 *v) {
     uint16_t *ptr = (uint16_t *)(&v->v);
     return __vec4_i16(ptr[0], ptr[1], ptr[2], ptr[3]);
 }

-static FORCEINLINE void __store(__vec4_i16 *p, __vec4_i16 value, int align) {
+template <int ALIGN> static FORCEINLINE void __store(__vec4_i16 *p, __vec4_i16 value) {
     uint16_t *ptr = (uint16_t *)(&p->v);
     ptr[0] = _mm_extract_epi16(value.v, 0);
     ptr[1] = _mm_extract_epi16(value.v, 1);
@@ -1060,12 +1060,12 @@ static FORCEINLINE __vec4_i32 __shuffle2_i32(__vec4_i32 v0, __vec4_i32 v1,
     return __vec4_i32(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE __vec4_i32 __load(__vec4_i32 *v, int align) {
+template <int ALIGN> static FORCEINLINE __vec4_i32 __load(__vec4_i32 *v) {
     // FIXME: handle align of 16...
     return _mm_loadu_si128((__m128i *)(&v->v));
 }

-static void __store(__vec4_i32 *p, __vec4_i32 value, int align) {
+template <int ALIGN> static void __store(__vec4_i32 *p, __vec4_i32 value) {
     // FIXME: handle align
     _mm_storeu_si128((__m128i *)(&p->v), value.v);
 }
@@ -1322,13 +1322,13 @@ static FORCEINLINE __vec4_i64 __shuffle2_i64(__vec4_i64 v0, __vec4_i64 v1,
     return __vec4_i64(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE __vec4_i64 __load(__vec4_i64 *v, int align) {
+template <int ALIGN> static FORCEINLINE __vec4_i64 __load(__vec4_i64 *v) {
     // FIXME: handle align of 16...
     return __vec4_i64(_mm_loadu_si128((__m128i *)(&v->v[0])),
                       _mm_loadu_si128((__m128i *)(&v->v[1])));
 }

-static FORCEINLINE void __store(__vec4_i64 *p, __vec4_i64 value, int align) {
+template <int ALIGN> static FORCEINLINE void __store(__vec4_i64 *p, __vec4_i64 value) {
     // FIXME: handle align
     _mm_storeu_si128((__m128i *)(&p->v[0]), value.v[0]);
     _mm_storeu_si128((__m128i *)(&p->v[1]), value.v[1]);
@@ -1425,12 +1425,12 @@ static FORCEINLINE __vec4_f __shuffle2_float(__vec4_f v0, __vec4_f v1,
     return __vec4_f(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE __vec4_f __load(__vec4_f *v, int align) {
+template <int ALIGN> static FORCEINLINE __vec4_f __load(__vec4_f *v) {
     // FIXME: handle align of 16...
     return _mm_loadu_ps((float *)(&v->v));
 }

-static FORCEINLINE void __store(__vec4_f *p, __vec4_f value, int align) {
+template <int ALIGN> static FORCEINLINE void __store(__vec4_f *p, __vec4_f value) {
     // FIXME: handle align
     _mm_storeu_ps((float *)(&p->v), value.v);
 }
@@ -1558,13 +1558,13 @@ static FORCEINLINE __vec4_d __shuffle2_double(__vec4_d v0, __vec4_d v1,
     return __vec4_d(r[0], r[1], r[2], r[3]);
 }

-static FORCEINLINE __vec4_d __load(__vec4_d *v, int align) {
+template <int ALIGN> static FORCEINLINE __vec4_d __load(__vec4_d *v) {
     // FIXME: handle align of 16...
     return __vec4_d(_mm_loadu_pd((double *)(&v->v[0])),
                     _mm_loadu_pd((double *)(&v->v[1])));
 }

-static FORCEINLINE void __store(__vec4_d *p, __vec4_d value, int align) {
+template <int ALIGN> static FORCEINLINE void __store(__vec4_d *p, __vec4_d value) {
     // FIXME: handle align
     _mm_storeu_pd((double *)(&p->v[0]), value.v[0]);
     _mm_storeu_pd((double *)(&p->v[1]), value.v[1]);