66
cbackend.cpp
66
cbackend.cpp
@@ -227,6 +227,8 @@ namespace {
|
|||||||
const llvm::TargetData* TD;
|
const llvm::TargetData* TD;
|
||||||
|
|
||||||
std::map<const llvm::ConstantFP *, unsigned> FPConstantMap;
|
std::map<const llvm::ConstantFP *, unsigned> FPConstantMap;
|
||||||
|
std::map<const llvm::ConstantDataVector *, unsigned> VectorConstantMap;
|
||||||
|
unsigned VectorConstantIndex;
|
||||||
std::set<llvm::Function*> intrinsicPrototypesAlreadyGenerated;
|
std::set<llvm::Function*> intrinsicPrototypesAlreadyGenerated;
|
||||||
std::set<const llvm::Argument*> ByValParams;
|
std::set<const llvm::Argument*> ByValParams;
|
||||||
unsigned FPCounter;
|
unsigned FPCounter;
|
||||||
@@ -253,6 +255,7 @@ namespace {
|
|||||||
vectorWidth(vecwidth) {
|
vectorWidth(vecwidth) {
|
||||||
initializeLoopInfoPass(*llvm::PassRegistry::getPassRegistry());
|
initializeLoopInfoPass(*llvm::PassRegistry::getPassRegistry());
|
||||||
FPCounter = 0;
|
FPCounter = 0;
|
||||||
|
VectorConstantIndex = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual const char *getPassName() const { return "C backend"; }
|
virtual const char *getPassName() const { return "C backend"; }
|
||||||
@@ -278,6 +281,10 @@ namespace {
|
|||||||
// Output all floating point constants that cannot be printed accurately.
|
// Output all floating point constants that cannot be printed accurately.
|
||||||
printFloatingPointConstants(F);
|
printFloatingPointConstants(F);
|
||||||
|
|
||||||
|
// Output all vector constants so they can be accessed with single
|
||||||
|
// vector loads
|
||||||
|
printVectorConstants(F);
|
||||||
|
|
||||||
printFunction(F);
|
printFunction(F);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -292,6 +299,7 @@ namespace {
|
|||||||
delete MRI;
|
delete MRI;
|
||||||
delete MOFI;
|
delete MOFI;
|
||||||
FPConstantMap.clear();
|
FPConstantMap.clear();
|
||||||
|
VectorConstantMap.clear();
|
||||||
ByValParams.clear();
|
ByValParams.clear();
|
||||||
intrinsicPrototypesAlreadyGenerated.clear();
|
intrinsicPrototypesAlreadyGenerated.clear();
|
||||||
UnnamedStructIDs.clear();
|
UnnamedStructIDs.clear();
|
||||||
@@ -350,6 +358,7 @@ namespace {
|
|||||||
void printContainedArrays(llvm::ArrayType *ATy, llvm::SmallPtrSet<llvm::Type *, 16> &);
|
void printContainedArrays(llvm::ArrayType *ATy, llvm::SmallPtrSet<llvm::Type *, 16> &);
|
||||||
void printFloatingPointConstants(llvm::Function &F);
|
void printFloatingPointConstants(llvm::Function &F);
|
||||||
void printFloatingPointConstants(const llvm::Constant *C);
|
void printFloatingPointConstants(const llvm::Constant *C);
|
||||||
|
void printVectorConstants(llvm::Function &F);
|
||||||
void printFunctionSignature(const llvm::Function *F, bool Prototype);
|
void printFunctionSignature(const llvm::Function *F, bool Prototype);
|
||||||
|
|
||||||
void printFunction(llvm::Function &);
|
void printFunction(llvm::Function &);
|
||||||
@@ -1511,6 +1520,22 @@ void CWriter::printConstant(llvm::Constant *CPV, bool Static) {
|
|||||||
printConstant(splatValue, Static);
|
printConstant(splatValue, Static);
|
||||||
Out << ")";
|
Out << ")";
|
||||||
}
|
}
|
||||||
|
else if (VectorConstantMap.find(CDV) != VectorConstantMap.end()) {
|
||||||
|
// If we have emitted an static const array with the
|
||||||
|
// vector's values, just load from it.
|
||||||
|
unsigned index = VectorConstantMap[CDV];
|
||||||
|
int alignment = 4 * std::min(vectorWidth, 16);
|
||||||
|
|
||||||
|
Out << "__load<" << alignment << ">(";
|
||||||
|
|
||||||
|
// Cast the pointer to the array of element values to a
|
||||||
|
// pointer to the vector type.
|
||||||
|
Out << "(const ";
|
||||||
|
printSimpleType(Out, CDV->getType(), true, "");
|
||||||
|
Out << " *)";
|
||||||
|
|
||||||
|
Out << "(VectorConstant" << index << "))";
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
printType(Out, CPV->getType());
|
printType(Out, CPV->getType());
|
||||||
Out << "(";
|
Out << "(";
|
||||||
@@ -2515,6 +2540,47 @@ void CWriter::printFloatingPointConstants(const llvm::Constant *C) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// For any vector constants, generate code to declare static const arrays
|
||||||
|
// with their element values. Doing so allows us to emit aligned vector
|
||||||
|
// loads to get their values, rather than tediously inserting the
|
||||||
|
// individual values into the vector.
|
||||||
|
void CWriter::printVectorConstants(llvm::Function &F) {
|
||||||
|
// LLVM 3.1 and beyond have a different representation of constant
|
||||||
|
// vectors than before--here we will only do this for 3.1 and later, as
|
||||||
|
// the separate code path isn't worth the trouble. This will hurt
|
||||||
|
// performance with 3.0 builds, though they should still generate
|
||||||
|
// correct code.
|
||||||
|
#ifndef LLVM_3_0
|
||||||
|
for (llvm::constant_iterator I = constant_begin(&F), E = constant_end(&F);
|
||||||
|
I != E; ++I) {
|
||||||
|
const llvm::ConstantDataVector *CDV = llvm::dyn_cast<llvm::ConstantDataVector>(*I);
|
||||||
|
if (CDV == NULL)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// Don't bother if this is a splat of the same value; a (more
|
||||||
|
// efficient?) __splat_* call will be generated for these.
|
||||||
|
if (CDV->getSplatValue() != NULL)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// Don't align to anything more than 64 bytes
|
||||||
|
int alignment = 4 * std::min(vectorWidth, 16);
|
||||||
|
|
||||||
|
Out << "static const ";
|
||||||
|
printSimpleType(Out, CDV->getElementType(), true, "");
|
||||||
|
Out << "__attribute__ ((aligned(" << alignment << "))) ";
|
||||||
|
Out << "VectorConstant" << VectorConstantIndex << "[] = { ";
|
||||||
|
for (int i = 0; i < (int)CDV->getNumElements(); ++i) {
|
||||||
|
printConstant(CDV->getElementAsConstant(i), false);
|
||||||
|
Out << ", ";
|
||||||
|
}
|
||||||
|
Out << " };\n";
|
||||||
|
|
||||||
|
VectorConstantMap[CDV] = VectorConstantIndex++;
|
||||||
|
}
|
||||||
|
Out << "\n";
|
||||||
|
#endif // !LLVM_3_0
|
||||||
|
}
|
||||||
|
|
||||||
/// printSymbolTable - Run through symbol table looking for type names. If a
|
/// printSymbolTable - Run through symbol table looking for type names. If a
|
||||||
/// type name is found, emit its declaration...
|
/// type name is found, emit its declaration...
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -212,7 +212,7 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
|
|||||||
|
|
||||||
#define LOAD_STORE(VTYPE, STYPE) \
|
#define LOAD_STORE(VTYPE, STYPE) \
|
||||||
template <int ALIGN> \
|
template <int ALIGN> \
|
||||||
static FORCEINLINE VTYPE __load(VTYPE *p) { \
|
static FORCEINLINE VTYPE __load(const VTYPE *p) { \
|
||||||
STYPE *ptr = (STYPE *)p; \
|
STYPE *ptr = (STYPE *)p; \
|
||||||
VTYPE ret; \
|
VTYPE ret; \
|
||||||
for (int i = 0; i < 16; ++i) \
|
for (int i = 0; i < 16; ++i) \
|
||||||
@@ -395,7 +395,7 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
|
|||||||
vec->v |= (1 << index);
|
vec->v |= (1 << index);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p) {
|
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
|
||||||
uint16_t *ptr = (uint16_t *)p;
|
uint16_t *ptr = (uint16_t *)p;
|
||||||
__vec16_i1 r;
|
__vec16_i1 r;
|
||||||
r.v = *ptr;
|
r.v = *ptr;
|
||||||
|
|||||||
@@ -277,7 +277,7 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
|
|||||||
|
|
||||||
#define LOAD_STORE(VTYPE, STYPE) \
|
#define LOAD_STORE(VTYPE, STYPE) \
|
||||||
template <int ALIGN> \
|
template <int ALIGN> \
|
||||||
static FORCEINLINE VTYPE __load(VTYPE *p) { \
|
static FORCEINLINE VTYPE __load(const VTYPE *p) { \
|
||||||
STYPE *ptr = (STYPE *)p; \
|
STYPE *ptr = (STYPE *)p; \
|
||||||
VTYPE ret; \
|
VTYPE ret; \
|
||||||
for (int i = 0; i < 32; ++i) \
|
for (int i = 0; i < 32; ++i) \
|
||||||
@@ -460,7 +460,7 @@ static FORCEINLINE void __insert_element(__vec32_i1 *vec, int index,
|
|||||||
vec->v |= (1 << index);
|
vec->v |= (1 << index);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec32_i1 __load(__vec32_i1 *p) {
|
template <int ALIGN> static FORCEINLINE __vec32_i1 __load(const __vec32_i1 *p) {
|
||||||
uint16_t *ptr = (uint16_t *)p;
|
uint16_t *ptr = (uint16_t *)p;
|
||||||
__vec32_i1 r;
|
__vec32_i1 r;
|
||||||
r.v = *ptr;
|
r.v = *ptr;
|
||||||
|
|||||||
@@ -402,7 +402,7 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
|
|||||||
|
|
||||||
#define LOAD_STORE(VTYPE, STYPE) \
|
#define LOAD_STORE(VTYPE, STYPE) \
|
||||||
template <int ALIGN> \
|
template <int ALIGN> \
|
||||||
static FORCEINLINE VTYPE __load(VTYPE *p) { \
|
static FORCEINLINE VTYPE __load(const VTYPE *p) { \
|
||||||
STYPE *ptr = (STYPE *)p; \
|
STYPE *ptr = (STYPE *)p; \
|
||||||
VTYPE ret; \
|
VTYPE ret; \
|
||||||
for (int i = 0; i < 64; ++i) \
|
for (int i = 0; i < 64; ++i) \
|
||||||
@@ -585,7 +585,7 @@ static FORCEINLINE void __insert_element(__vec64_i1 *vec, int index,
|
|||||||
vec->v |= (1ull << index);
|
vec->v |= (1ull << index);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec64_i1 __load(__vec64_i1 *p) {
|
template <int ALIGN> static FORCEINLINE __vec64_i1 __load(const __vec64_i1 *p) {
|
||||||
uint16_t *ptr = (uint16_t *)p;
|
uint16_t *ptr = (uint16_t *)p;
|
||||||
__vec64_i1 r;
|
__vec64_i1 r;
|
||||||
r.v = *ptr;
|
r.v = *ptr;
|
||||||
|
|||||||
@@ -465,8 +465,8 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
|
|||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p) {
|
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
|
||||||
uint16_t *ptr = (uint16_t *)p;
|
const uint16_t *ptr = (const uint16_t *)p;
|
||||||
__vec16_i1 r;
|
__vec16_i1 r;
|
||||||
r.m = *ptr;
|
r.m = *ptr;
|
||||||
return r;
|
return r;
|
||||||
@@ -729,14 +729,14 @@ static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __
|
|||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec16_i32 __load(__vec16_i32 *p) {
|
template <int ALIGN> static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) {
|
||||||
__vec16_i32 v;
|
__vec16_i32 v;
|
||||||
v = _mm512_extloadunpackhi_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
v = _mm512_extloadunpackhi_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||||
v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <> static FORCEINLINE __vec16_i32 __load<64>(__vec16_i32 *p) {
|
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) {
|
||||||
return _mm512_load_epi32(p);
|
return _mm512_load_epi32(p);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -827,7 +827,7 @@ SHUFFLES(__vec16_i64, i64, int64_t)
|
|||||||
LOAD_STORE(__vec16_i64, int64_t)
|
LOAD_STORE(__vec16_i64, int64_t)
|
||||||
|
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
|
template <int ALIGN> static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) {
|
||||||
__m512i v1;
|
__m512i v1;
|
||||||
__m512i v2;
|
__m512i v2;
|
||||||
v2 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
v2 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
|
||||||
@@ -851,7 +851,7 @@ template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <> static FORCEINLINE __vec16_i64 __load<64>(__vec16_i64 *p) {
|
template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) {
|
||||||
__m512i v2 = _mm512_load_epi32(p);
|
__m512i v2 = _mm512_load_epi32(p);
|
||||||
__m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
|
__m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
|
||||||
__vec16_i64 ret;
|
__vec16_i64 ret;
|
||||||
@@ -1015,14 +1015,14 @@ static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f v0, __vec16_f v1, __vec1
|
|||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec16_f __load(__vec16_f *p) {
|
template <int ALIGN> static FORCEINLINE __vec16_f __load(const __vec16_f *p) {
|
||||||
__vec16_f v;
|
__vec16_f v;
|
||||||
v = _mm512_extloadunpackhi_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
|
v = _mm512_extloadunpackhi_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
|
||||||
v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
|
v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <> static FORCEINLINE __vec16_f __load<64>(__vec16_f *p) {
|
template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) {
|
||||||
return _mm512_load_ps(p);
|
return _mm512_load_ps(p);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1184,7 +1184,7 @@ static FORCEINLINE __vec16_f __shuffle2_float(__vec16_d v0, __vec16_d v1, __vec1
|
|||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec16_d __load(__vec16_d *p) {
|
template <int ALIGN> static FORCEINLINE __vec16_d __load(const __vec16_d *p) {
|
||||||
__vec16_d ret;
|
__vec16_d ret;
|
||||||
ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
|
ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
|
||||||
ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
|
ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
|
||||||
@@ -1193,7 +1193,7 @@ template <int ALIGN> static FORCEINLINE __vec16_d __load(__vec16_d *p) {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <> static FORCEINLINE __vec16_d __load<64>(__vec16_d *p) {
|
template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) {
|
||||||
__vec16_d ret;
|
__vec16_d ret;
|
||||||
ret.v1 = _mm512_load_pd(p);
|
ret.v1 = _mm512_load_pd(p);
|
||||||
ret.v2 = _mm512_load_pd(((uint8_t*)p)+64);
|
ret.v2 = _mm512_load_pd(((uint8_t*)p)+64);
|
||||||
|
|||||||
@@ -287,7 +287,7 @@ static FORCEINLINE void __insert_element(__vec4_i1 *v, int index, bool val) {
|
|||||||
((int32_t *)v)[index] = val ? -1 : 0;
|
((int32_t *)v)[index] = val ? -1 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec4_i1 __load(__vec4_i1 *v) {
|
template <int ALIGN> static FORCEINLINE __vec4_i1 __load(const __vec4_i1 *v) {
|
||||||
// FIXME: handle align of 16...
|
// FIXME: handle align of 16...
|
||||||
return _mm_loadu_ps((float *)(&v->v));
|
return _mm_loadu_ps((float *)(&v->v));
|
||||||
}
|
}
|
||||||
@@ -573,7 +573,7 @@ static FORCEINLINE __vec4_i8 __shuffle2_i8(__vec4_i8 v0, __vec4_i8 v1,
|
|||||||
return __vec4_i8(r[0], r[1], r[2], r[3]);
|
return __vec4_i8(r[0], r[1], r[2], r[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec4_i8 __load(__vec4_i8 *v) {
|
template <int ALIGN> static FORCEINLINE __vec4_i8 __load(const __vec4_i8 *v) {
|
||||||
uint8_t *ptr = (uint8_t *)(&v->v);
|
uint8_t *ptr = (uint8_t *)(&v->v);
|
||||||
return __vec4_i8(ptr[0], ptr[1], ptr[2], ptr[3]);
|
return __vec4_i8(ptr[0], ptr[1], ptr[2], ptr[3]);
|
||||||
}
|
}
|
||||||
@@ -840,7 +840,7 @@ static FORCEINLINE __vec4_i16 __shuffle2_i16(__vec4_i16 v0, __vec4_i16 v1,
|
|||||||
return __vec4_i16(r[0], r[1], r[2], r[3]);
|
return __vec4_i16(r[0], r[1], r[2], r[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec4_i16 __load(__vec4_i16 *v) {
|
template <int ALIGN> static FORCEINLINE __vec4_i16 __load(const __vec4_i16 *v) {
|
||||||
uint16_t *ptr = (uint16_t *)(&v->v);
|
uint16_t *ptr = (uint16_t *)(&v->v);
|
||||||
return __vec4_i16(ptr[0], ptr[1], ptr[2], ptr[3]);
|
return __vec4_i16(ptr[0], ptr[1], ptr[2], ptr[3]);
|
||||||
}
|
}
|
||||||
@@ -1093,7 +1093,7 @@ static FORCEINLINE __vec4_i32 __shuffle2_i32(__vec4_i32 v0, __vec4_i32 v1,
|
|||||||
return __vec4_i32(r[0], r[1], r[2], r[3]);
|
return __vec4_i32(r[0], r[1], r[2], r[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec4_i32 __load(__vec4_i32 *v) {
|
template <int ALIGN> static FORCEINLINE __vec4_i32 __load(const __vec4_i32 *v) {
|
||||||
// FIXME: handle align of 16...
|
// FIXME: handle align of 16...
|
||||||
return _mm_loadu_si128((__m128i *)(&v->v));
|
return _mm_loadu_si128((__m128i *)(&v->v));
|
||||||
}
|
}
|
||||||
@@ -1363,7 +1363,7 @@ static FORCEINLINE __vec4_i64 __shuffle2_i64(__vec4_i64 v0, __vec4_i64 v1,
|
|||||||
return __vec4_i64(r[0], r[1], r[2], r[3]);
|
return __vec4_i64(r[0], r[1], r[2], r[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec4_i64 __load(__vec4_i64 *v) {
|
template <int ALIGN> static FORCEINLINE __vec4_i64 __load(const __vec4_i64 *v) {
|
||||||
// FIXME: handle align of 16...
|
// FIXME: handle align of 16...
|
||||||
return __vec4_i64(_mm_loadu_si128((__m128i *)(&v->v[0])),
|
return __vec4_i64(_mm_loadu_si128((__m128i *)(&v->v[0])),
|
||||||
_mm_loadu_si128((__m128i *)(&v->v[1])));
|
_mm_loadu_si128((__m128i *)(&v->v[1])));
|
||||||
@@ -1474,7 +1474,7 @@ static FORCEINLINE __vec4_f __shuffle2_float(__vec4_f v0, __vec4_f v1,
|
|||||||
return __vec4_f(r[0], r[1], r[2], r[3]);
|
return __vec4_f(r[0], r[1], r[2], r[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec4_f __load(__vec4_f *v) {
|
template <int ALIGN> static FORCEINLINE __vec4_f __load(const __vec4_f *v) {
|
||||||
// FIXME: handle align of 16...
|
// FIXME: handle align of 16...
|
||||||
return _mm_loadu_ps((float *)(&v->v));
|
return _mm_loadu_ps((float *)(&v->v));
|
||||||
}
|
}
|
||||||
@@ -1615,7 +1615,7 @@ static FORCEINLINE __vec4_d __shuffle2_double(__vec4_d v0, __vec4_d v1,
|
|||||||
return __vec4_d(r[0], r[1], r[2], r[3]);
|
return __vec4_d(r[0], r[1], r[2], r[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec4_d __load(__vec4_d *v) {
|
template <int ALIGN> static FORCEINLINE __vec4_d __load(const __vec4_d *v) {
|
||||||
// FIXME: handle align of 16...
|
// FIXME: handle align of 16...
|
||||||
return __vec4_d(_mm_loadu_pd((double *)(&v->v[0])),
|
return __vec4_d(_mm_loadu_pd((double *)(&v->v[0])),
|
||||||
_mm_loadu_pd((double *)(&v->v[1])));
|
_mm_loadu_pd((double *)(&v->v[1])));
|
||||||
|
|||||||
Reference in New Issue
Block a user