diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index cfbe6678..48e7b836 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -41,12 +41,26 @@ reduce_equal(WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; broadcast/rotate/shuffle

-declare <WIDTH x float> @__smear_float(<WIDTH x float>, float) nounwind readnone
-declare <WIDTH x double> @__smear_double(<WIDTH x double>, double) nounwind readnone
-declare <WIDTH x i8> @__smear_i8(<WIDTH x i8>, i8) nounwind readnone
-declare <WIDTH x i16> @__smear_i16(<WIDTH x i16>, i16) nounwind readnone
-declare <WIDTH x i32> @__smear_i32(<WIDTH x i32>, i32) nounwind readnone
-declare <WIDTH x i64> @__smear_i64(<WIDTH x i64>, i64) nounwind readnone
+declare <WIDTH x float> @__smear_float(float) nounwind readnone
+declare <WIDTH x double> @__smear_double(double) nounwind readnone
+declare <WIDTH x i8> @__smear_i8(i8) nounwind readnone
+declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
+declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
+declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
+
+declare <WIDTH x float> @__setzero_float() nounwind readnone
+declare <WIDTH x double> @__setzero_double() nounwind readnone
+declare <WIDTH x i8> @__setzero_i8() nounwind readnone
+declare <WIDTH x i16> @__setzero_i16() nounwind readnone
+declare <WIDTH x i32> @__setzero_i32() nounwind readnone
+declare <WIDTH x i64> @__setzero_i64() nounwind readnone
+
+declare <WIDTH x float> @__undef_float() nounwind readnone
+declare <WIDTH x double> @__undef_double() nounwind readnone
+declare <WIDTH x i8> @__undef_i8() nounwind readnone
+declare <WIDTH x i16> @__undef_i16() nounwind readnone
+declare <WIDTH x i32> @__undef_i32() nounwind readnone
+declare <WIDTH x i64> @__undef_i64() nounwind readnone

 declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
 declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
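The practical effect of the new interface on code emitted by the C++ backend: __smear_* no longer takes a dummy vector argument whose only job was to select the right overload, and all-zero and undefined vector constants get dedicated entry points. Roughly, for a hypothetical 4-wide float target (illustrative only; the real output is produced by CWriter::printConstant below):

    // before: the vector type was carried by a throwaway first argument
    __vec4_f splat = __smear_float(__vec4_f(), 1.0f);
    __vec4_f zero  = __smear_float(__vec4_f(), 0.0f);

    // after: scalar-only smear, plus dedicated zero/undef constructors
    __vec4_f splat = __smear_float(1.0f);
    __vec4_f zero  = __setzero_float();
    __vec4_f junk  = __undef_float();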
diff --git a/cbackend.cpp b/cbackend.cpp
index f409d88f..5467d1e2 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -1098,22 +1098,31 @@ bool CWriter::printCast(unsigned opc, llvm::Type *SrcTy, llvm::Type *DstTy) {
 }

-// FIXME: generalize this/make it not so hard-coded?
-static const char *lGetSmearFunc(llvm::Type *matchType) {
+/** Construct the name of a function with the given base and returning a
+    vector of a given type.  For example, if base is "foo" and matchType is
+    i16, this will return the string "__foo_i16".
+ */
+static const char *
+lGetTypedFunc(const char *base, llvm::Type *matchType) {
+    char buf[64];
+    sprintf(buf, "__%s_", base);
     switch (matchType->getTypeID()) {
-    case llvm::Type::FloatTyID:  return "__smear_float";
-    case llvm::Type::DoubleTyID: return "__smear_double";
+    case llvm::Type::FloatTyID:  strcat(buf, "float");  break;
+    case llvm::Type::DoubleTyID: strcat(buf, "double"); break;
     case llvm::Type::IntegerTyID: {
         switch (llvm::cast<llvm::IntegerType>(matchType)->getBitWidth()) {
-        case 1:  return "__smear_i1";
-        case 8:  return "__smear_i8";
-        case 16: return "__smear_i16";
-        case 32: return "__smear_i32";
-        case 64: return "__smear_i64";
+        case 1:  strcat(buf, "i1");  break;
+        case 8:  strcat(buf, "i8");  break;
+        case 16: strcat(buf, "i16"); break;
+        case 32: strcat(buf, "i32"); break;
+        case 64: strcat(buf, "i64"); break;
+        default: return NULL;
         }
+        break;
     }
     default: return NULL;
     }
+    return strdup(buf);
 }

@@ -1458,64 +1467,63 @@ void CWriter::printConstant(llvm::Constant *CPV, bool Static) {
     }
     case llvm::Type::VectorTyID: {
         llvm::VectorType *VT = llvm::dyn_cast<llvm::VectorType>(CPV->getType());
-        const char *smearFunc = lGetSmearFunc(VT->getElementType());
         if (llvm::isa<llvm::ConstantAggregateZero>(CPV)) {
-            assert(smearFunc != NULL);
-
-            llvm::Constant *CZ = llvm::Constant::getNullValue(VT->getElementType());
-            Out << smearFunc << "(";
-            printType(Out, VT);
-            Out << "(), ";
-            printConstant(CZ, Static);
-            Out << ")";
+            // All zeros; call the __setzero_* function.
+            const char *setZeroFunc = lGetTypedFunc("setzero", VT->getElementType());
+            assert(setZeroFunc != NULL);
+            Out << setZeroFunc << "()";
         }
-        else if (llvm::ConstantVector *CV = llvm::dyn_cast<llvm::ConstantVector>(CPV)) {
-            llvm::Constant *splatValue = CV->getSplatValue();
-            if (splatValue != NULL && smearFunc != NULL) {
-                Out << smearFunc << "(";
-                printType(Out, VT);
-                Out << "(), ";
-                printConstant(splatValue, Static);
-                Out << ")";
-            }
-            else {
-                printType(Out, CPV->getType());
-                Out << "(";
-                printConstantVector(CV, Static);
-                Out << ")";
-            }
+        else if (llvm::isa<llvm::UndefValue>(CPV)) {
+            // Undefined value; call __undef_* so that we can potentially
+            // pass this information along.
+            const char *undefFunc = lGetTypedFunc("undef", VT->getElementType());
+            assert(undefFunc != NULL);
+            Out << undefFunc << "()";
         }
-#ifndef LLVM_3_0
-        else if (llvm::ConstantDataVector *CDV = llvm::dyn_cast<llvm::ConstantDataVector>(CPV)) {
-            llvm::Constant *splatValue = CDV->getSplatValue();
-            if (splatValue != NULL && smearFunc != NULL) {
-                Out << smearFunc << "(";
-                printType(Out, VT);
-                Out << "(), ";
-                printConstant(splatValue, Static);
-                Out << ")";
-            }
-            else {
-                printType(Out, CPV->getType());
-                Out << "(";
-                printConstantDataSequential(CDV, Static);
-                Out << ")";
-            }
-        }
-#endif // !LLVM_3_0
         else {
-            assert(llvm::isa<llvm::UndefValue>(CPV));
-            llvm::Constant *CZ = llvm::Constant::getNullValue(VT->getElementType());
-            printType(Out, CPV->getType());
-            Out << "(";
-            printConstant(CZ, Static);
-            for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) {
-                Out << ", ";
-                printConstant(CZ, Static);
-            }
-            Out << ")";
+            const char *smearFunc = lGetTypedFunc("smear", VT->getElementType());
+
+            if (llvm::ConstantVector *CV = llvm::dyn_cast<llvm::ConstantVector>(CPV)) {
+                llvm::Constant *splatValue = CV->getSplatValue();
+                if (splatValue != NULL && smearFunc != NULL) {
+                    // If it's a basic type and has a __smear_* function,
+                    // then call that.
+                    Out << smearFunc << "(";
+                    printConstant(splatValue, Static);
+                    Out << ")";
+                }
+                else {
+                    // Otherwise call the constructor for the type.
+                    printType(Out, CPV->getType());
+                    Out << "(";
+                    printConstantVector(CV, Static);
+                    Out << ")";
+                }
+            }
+#ifndef LLVM_3_0
+            // LLVM 3.1 and beyond have a different representation of
+            // constant vectors.
+            else if (llvm::ConstantDataVector *CDV =
+                     llvm::dyn_cast<llvm::ConstantDataVector>(CPV)) {
+                llvm::Constant *splatValue = CDV->getSplatValue();
+                if (splatValue != NULL && smearFunc != NULL) {
+                    Out << smearFunc << "(";
+                    printConstant(splatValue, Static);
+                    Out << ")";
+                }
+                else {
+                    printType(Out, CPV->getType());
+                    Out << "(";
+                    printConstantDataSequential(CDV, Static);
+                    Out << ")";
+                }
+            }
+#endif // !LLVM_3_0
+            else {
+                llvm::report_fatal_error("Unexpected vector type");
+            }
         }
+        break;
     }
     case llvm::Type::StructTyID:
@@ -4233,7 +4241,7 @@
 char SmearCleanupPass::ID = 0;

 static int
-    lChainLength(llvm::InsertElementInst *inst) {
+lChainLength(llvm::InsertElementInst *inst) {
     int length = 0;
     while (inst != NULL) {
         ++length;
@@ -4281,24 +4289,26 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb)
 {

         llvm::Type *matchType = toMatch->getType();
-        const char *smearFuncName = lGetSmearFunc(matchType);
+        const char *smearFuncName = lGetTypedFunc("smear", matchType);

         if (smearFuncName != NULL) {
             llvm::Function *smearFunc = module->getFunction(smearFuncName);
             if (smearFunc == NULL) {
+                // Declare the smear function if needed; it takes a single
+                // scalar parameter and returns a vector of the same
+                // parameter type.
                 llvm::Constant *sf =
                     module->getOrInsertFunction(smearFuncName, iter->getType(),
-                                                iter->getType(), matchType, NULL);
+                                                matchType, NULL);
                 smearFunc = llvm::dyn_cast<llvm::Function>(sf);
                 assert(smearFunc != NULL);
                 smearFunc->setDoesNotThrow(true);
                 smearFunc->setDoesNotAccessMemory(true);
             }

-            llvm::Value *undefResult = llvm::UndefValue::get(vt);
             assert(smearFunc != NULL);
-            llvm::Value *args[2] = { undefResult, toMatch };
-            llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[2]);
+            llvm::Value *args[1] = { toMatch };
+            llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[1]);
             llvm::Instruction *smearCall =
                 llvm::CallInst::Create(smearFunc, argArray,
                                        LLVMGetName(toMatch, "_smear"),
                                        (llvm::Instruction *)NULL);
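A quick illustration of the naming scheme lGetTypedFunc implements; the calls below are hypothetical driver code, not part of the patch:

    llvm::LLVMContext ctx;
    const char *a = lGetTypedFunc("smear",   llvm::Type::getFloatTy(ctx)); // "__smear_float"
    const char *b = lGetTypedFunc("setzero", llvm::Type::getInt16Ty(ctx)); // "__setzero_i16"
    const char *c = lGetTypedFunc("undef",   llvm::Type::getInt1Ty(ctx));  // "__undef_i1"
    // Unsupported element types yield NULL, which every caller must check.
    // Note that the returned string is strdup()'d, so each call allocates.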
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index bb5b0dc3..1737063f 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -262,13 +262,26 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {  \
 }

 #define SMEAR(VTYPE, NAME, STYPE)                                 \
-static FORCEINLINE VTYPE __smear_##NAME(VTYPE retType, STYPE v) { \
+static FORCEINLINE VTYPE __smear_##NAME(STYPE v) {                \
     VTYPE ret;                                                    \
     for (int i = 0; i < 16; ++i)                                  \
         ret.v[i] = v;                                             \
     return ret;                                                   \
 }

+#define SETZERO(VTYPE, NAME)                                      \
+static FORCEINLINE VTYPE __setzero_##NAME() {                     \
+    VTYPE ret;                                                    \
+    for (int i = 0; i < 16; ++i)                                  \
+        ret.v[i] = 0;                                             \
+    return ret;                                                   \
+}
+
+#define UNDEF(VTYPE, NAME)                                        \
+static FORCEINLINE VTYPE __undef_##NAME() {                       \
+    return VTYPE();                                               \
+}
+
 #define BROADCAST(VTYPE, NAME, STYPE)                             \
 static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
     VTYPE ret;                                                    \
@@ -394,11 +407,21 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) {
     *ptr = v.v;
 }

-static FORCEINLINE __vec16_i1 __smear_i1(__vec16_i1, int v) {
+static FORCEINLINE __vec16_i1 __smear_i1(int v) {
     return __vec16_i1(v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v);
 }

+static FORCEINLINE __vec16_i1 __setzero_i1() {
+    return __vec16_i1(0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+
+static FORCEINLINE __vec16_i1 __undef_i1() {
+    return __vec16_i1();
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // int8
@@ -438,6 +461,8 @@ CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >)
 SELECT(__vec16_i8)
 INSERT_EXTRACT(__vec16_i8, int8_t)
 SMEAR(__vec16_i8, i8, int8_t)
+SETZERO(__vec16_i8, i8)
+UNDEF(__vec16_i8, i8)
 BROADCAST(__vec16_i8, i8, int8_t)
 ROTATE(__vec16_i8, i8, int8_t)
 SHUFFLES(__vec16_i8, i8, int8_t)
@@ -481,6 +506,8 @@ CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >)
 SELECT(__vec16_i16)
 INSERT_EXTRACT(__vec16_i16, int16_t)
 SMEAR(__vec16_i16, i16, int16_t)
+SETZERO(__vec16_i16, i16)
+UNDEF(__vec16_i16, i16)
 BROADCAST(__vec16_i16, i16, int16_t)
 ROTATE(__vec16_i16, i16, int16_t)
 SHUFFLES(__vec16_i16, i16, int16_t)
@@ -524,6 +551,8 @@ CMP_OP(__vec16_i32, i32, int32_t, __signed_greater_than, >)
 SELECT(__vec16_i32)
 INSERT_EXTRACT(__vec16_i32, int32_t)
 SMEAR(__vec16_i32, i32, int32_t)
+SETZERO(__vec16_i32, i32)
+UNDEF(__vec16_i32, i32)
 BROADCAST(__vec16_i32, i32, int32_t)
 ROTATE(__vec16_i32, i32, int32_t)
 SHUFFLES(__vec16_i32, i32, int32_t)
@@ -567,6 +596,8 @@ CMP_OP(__vec16_i64, i64, int64_t, __signed_greater_than, >)
 SELECT(__vec16_i64)
 INSERT_EXTRACT(__vec16_i64, int64_t)
 SMEAR(__vec16_i64, i64, int64_t)
+SETZERO(__vec16_i64, i64)
+UNDEF(__vec16_i64, i64)
 BROADCAST(__vec16_i64, i64, int64_t)
 ROTATE(__vec16_i64, i64, int64_t)
 SHUFFLES(__vec16_i64, i64, int64_t)
@@ -602,6 +633,8 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
 SELECT(__vec16_f)
 INSERT_EXTRACT(__vec16_f, float)
 SMEAR(__vec16_f, float, float)
+SETZERO(__vec16_f, float)
+UNDEF(__vec16_f, float)
 BROADCAST(__vec16_f, float, float)
 ROTATE(__vec16_f, float, float)
 SHUFFLES(__vec16_f, float, float)
@@ -752,6 +785,8 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
 SELECT(__vec16_d)
 INSERT_EXTRACT(__vec16_d, double)
 SMEAR(__vec16_d, double, double)
+SETZERO(__vec16_d, double)
+UNDEF(__vec16_d, double)
 BROADCAST(__vec16_d, double, double)
 ROTATE(__vec16_d, double, double)
 SHUFFLES(__vec16_d, double, double)
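For reference, SETZERO(__vec16_f, float) from the macro above expands to:

    static FORCEINLINE __vec16_f __setzero_float() {
        __vec16_f ret;
        for (int i = 0; i < 16; ++i)
            ret.v[i] = 0;
        return ret;
    }

The same pattern repeats in the 32- and 64-wide generic headers below, differing only in the lane count.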
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index b593ac87..c50588a1 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -327,13 +327,26 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {  \
 }

 #define SMEAR(VTYPE, NAME, STYPE)                                 \
-static FORCEINLINE VTYPE __smear_##NAME(VTYPE retType, STYPE v) { \
+static FORCEINLINE VTYPE __smear_##NAME(STYPE v) {                \
     VTYPE ret;                                                    \
     for (int i = 0; i < 32; ++i)                                  \
         ret.v[i] = v;                                             \
     return ret;                                                   \
 }

+#define SETZERO(VTYPE, NAME)                                      \
+static FORCEINLINE VTYPE __setzero_##NAME() {                     \
+    VTYPE ret;                                                    \
+    for (int i = 0; i < 32; ++i)                                  \
+        ret.v[i] = 0;                                             \
+    return ret;                                                   \
+}
+
+#define UNDEF(VTYPE, NAME)                                        \
+static FORCEINLINE VTYPE __undef_##NAME() {                       \
+    return VTYPE();                                               \
+}
+
 #define BROADCAST(VTYPE, NAME, STYPE)                             \
 static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
     VTYPE ret;                                                    \
@@ -459,13 +472,24 @@ template <int ALIGN> static FORCEINLINE void __store(__vec32_i1 *p, __vec32_i1 v) {
     *ptr = v.v;
 }

-static FORCEINLINE __vec32_i1 __smear_i1(__vec32_i1, int v) {
+static FORCEINLINE __vec32_i1 __smear_i1(int v) {
     return __vec32_i1(v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v);
 }

+static FORCEINLINE __vec32_i1 __setzero_i1() {
+    return __vec32_i1(0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static FORCEINLINE __vec32_i1 __undef_i1() {
+    return __vec32_i1();
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // int8
@@ -505,6 +529,8 @@ CMP_OP(__vec32_i8, i8, int8_t, __signed_greater_than, >)
 SELECT(__vec32_i8)
 INSERT_EXTRACT(__vec32_i8, int8_t)
 SMEAR(__vec32_i8, i8, int8_t)
+SETZERO(__vec32_i8, i8)
+UNDEF(__vec32_i8, i8)
 BROADCAST(__vec32_i8, i8, int8_t)
 ROTATE(__vec32_i8, i8, int8_t)
 SHUFFLES(__vec32_i8, i8, int8_t)
@@ -548,6 +574,8 @@ CMP_OP(__vec32_i16, i16, int16_t, __signed_greater_than, >)
 SELECT(__vec32_i16)
 INSERT_EXTRACT(__vec32_i16, int16_t)
 SMEAR(__vec32_i16, i16, int16_t)
+SETZERO(__vec32_i16, i16)
+UNDEF(__vec32_i16, i16)
 BROADCAST(__vec32_i16, i16, int16_t)
 ROTATE(__vec32_i16, i16, int16_t)
 SHUFFLES(__vec32_i16, i16, int16_t)
@@ -591,6 +619,8 @@ CMP_OP(__vec32_i32, i32, int32_t, __signed_greater_than, >)
 SELECT(__vec32_i32)
 INSERT_EXTRACT(__vec32_i32, int32_t)
 SMEAR(__vec32_i32, i32, int32_t)
+SETZERO(__vec32_i32, i32)
+UNDEF(__vec32_i32, i32)
 BROADCAST(__vec32_i32, i32, int32_t)
 ROTATE(__vec32_i32, i32, int32_t)
 SHUFFLES(__vec32_i32, i32, int32_t)
@@ -634,6 +664,8 @@ CMP_OP(__vec32_i64, i64, int64_t, __signed_greater_than, >)
 SELECT(__vec32_i64)
 INSERT_EXTRACT(__vec32_i64, int64_t)
 SMEAR(__vec32_i64, i64, int64_t)
+SETZERO(__vec32_i64, i64)
+UNDEF(__vec32_i64, i64)
 BROADCAST(__vec32_i64, i64, int64_t)
 ROTATE(__vec32_i64, i64, int64_t)
 SHUFFLES(__vec32_i64, i64, int64_t)
@@ -669,6 +701,8 @@ static FORCEINLINE __vec32_i1 __ordered(__vec32_f a, __vec32_f b) {
 SELECT(__vec32_f)
 INSERT_EXTRACT(__vec32_f, float)
 SMEAR(__vec32_f, float, float)
+SETZERO(__vec32_f, float)
+UNDEF(__vec32_f, float)
 BROADCAST(__vec32_f, float, float)
 ROTATE(__vec32_f, float, float)
 SHUFFLES(__vec32_f, float, float)
@@ -819,6 +853,8 @@ static FORCEINLINE __vec32_i1 __ordered(__vec32_d a, __vec32_d b) {
 SELECT(__vec32_d)
 INSERT_EXTRACT(__vec32_d, double)
 SMEAR(__vec32_d, double, double)
+SETZERO(__vec32_d, double)
+UNDEF(__vec32_d, double)
 BROADCAST(__vec32_d, double, double)
 ROTATE(__vec32_d, double, double)
 SHUFFLES(__vec32_d, double, double)
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index a0456e49..41ac8a3c 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -452,13 +452,26 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {  \
 }

 #define SMEAR(VTYPE, NAME, STYPE)                                 \
-static FORCEINLINE VTYPE __smear_##NAME(VTYPE retType, STYPE v) { \
+static FORCEINLINE VTYPE __smear_##NAME(STYPE v) {                \
     VTYPE ret;                                                    \
     for (int i = 0; i < 64; ++i)                                  \
         ret.v[i] = v;                                             \
     return ret;                                                   \
 }

+#define SETZERO(VTYPE, NAME)                                      \
+static FORCEINLINE VTYPE __setzero_##NAME() {                     \
+    VTYPE ret;                                                    \
+    for (int i = 0; i < 64; ++i)                                  \
+        ret.v[i] = 0;                                             \
+    return ret;                                                   \
+}
+
+#define UNDEF(VTYPE, NAME)                                        \
+static FORCEINLINE VTYPE __undef_##NAME() {                       \
+    return VTYPE();                                               \
+}
+
 #define BROADCAST(VTYPE, NAME, STYPE)                             \
 static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) { \
     VTYPE ret;                                                    \
@@ -584,7 +597,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec64_i1 *p, __vec64_i1 v) {
     *ptr = v.v;
 }

-static FORCEINLINE __vec64_i1 __smear_i1(__vec64_i1, int v) {
+static FORCEINLINE __vec64_i1 __smear_i1(int v) {
     return __vec64_i1(v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
@@ -595,6 +608,21 @@ static FORCEINLINE __vec64_i1 __smear_i1(__vec64_i1, int v) {
                       v, v, v, v, v, v, v, v);
 }

+static FORCEINLINE __vec64_i1 __setzero_i1() {
+    return __vec64_i1(0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static FORCEINLINE __vec64_i1 __undef_i1() {
+    return __vec64_i1();
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // int8
@@ -634,6 +662,8 @@ CMP_OP(__vec64_i8, i8, int8_t, __signed_greater_than, >)
 SELECT(__vec64_i8)
 INSERT_EXTRACT(__vec64_i8, int8_t)
 SMEAR(__vec64_i8, i8, int8_t)
+SETZERO(__vec64_i8, i8)
+UNDEF(__vec64_i8, i8)
 BROADCAST(__vec64_i8, i8, int8_t)
 ROTATE(__vec64_i8, i8, int8_t)
 SHUFFLES(__vec64_i8, i8, int8_t)
@@ -677,6 +707,8 @@ CMP_OP(__vec64_i16, i16, int16_t, __signed_greater_than, >)
 SELECT(__vec64_i16)
 INSERT_EXTRACT(__vec64_i16, int16_t)
 SMEAR(__vec64_i16, i16, int16_t)
+SETZERO(__vec64_i16, i16)
+UNDEF(__vec64_i16, i16)
 BROADCAST(__vec64_i16, i16, int16_t)
 ROTATE(__vec64_i16, i16, int16_t)
 SHUFFLES(__vec64_i16, i16, int16_t)
@@ -720,6 +752,8 @@ CMP_OP(__vec64_i32, i32, int32_t, __signed_greater_than, >)
 SELECT(__vec64_i32)
 INSERT_EXTRACT(__vec64_i32, int32_t)
 SMEAR(__vec64_i32, i32, int32_t)
+SETZERO(__vec64_i32, i32)
+UNDEF(__vec64_i32, i32)
 BROADCAST(__vec64_i32, i32, int32_t)
 ROTATE(__vec64_i32, i32, int32_t)
 SHUFFLES(__vec64_i32, i32, int32_t)
@@ -763,6 +797,8 @@ CMP_OP(__vec64_i64, i64, int64_t, __signed_greater_than, >)
 SELECT(__vec64_i64)
 INSERT_EXTRACT(__vec64_i64, int64_t)
 SMEAR(__vec64_i64, i64, int64_t)
+SETZERO(__vec64_i64, i64)
+UNDEF(__vec64_i64, i64)
 BROADCAST(__vec64_i64, i64, int64_t)
 ROTATE(__vec64_i64, i64, int64_t)
 SHUFFLES(__vec64_i64, i64, int64_t)
@@ -798,6 +834,8 @@ static FORCEINLINE __vec64_i1 __ordered(__vec64_f a, __vec64_f b) {
 SELECT(__vec64_f)
 INSERT_EXTRACT(__vec64_f, float)
 SMEAR(__vec64_f, float, float)
+SETZERO(__vec64_f, float)
+UNDEF(__vec64_f, float)
 BROADCAST(__vec64_f, float, float)
 ROTATE(__vec64_f, float, float)
 SHUFFLES(__vec64_f, float, float)
@@ -948,6 +986,8 @@ static FORCEINLINE __vec64_i1 __ordered(__vec64_d a, __vec64_d b) {
 SELECT(__vec64_d)
 INSERT_EXTRACT(__vec64_d, double)
 SMEAR(__vec64_d, double, double)
+SETZERO(__vec64_d, double)
+UNDEF(__vec64_d, double)
 BROADCAST(__vec64_d, double, double)
 ROTATE(__vec64_d, double, double)
 SHUFFLES(__vec64_d, double, double)
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 243dc539..ae60e9ae 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -477,10 +477,18 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) {
     *ptr = v.m;
 }

-static FORCEINLINE __vec16_i1 __smear_i1(__vec16_i1, int i) {
+static FORCEINLINE __vec16_i1 __smear_i1(int i) {
     return i?0xFFFF:0x0;
 }

+static FORCEINLINE __vec16_i1 __setzero_i1() {
+    return 0;
+}
+
+static FORCEINLINE __vec16_i1 __undef_i1() {
+    return __vec16_i1(); // FIXME? __mm512_undef_mask();
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // int8
@@ -686,10 +694,18 @@ static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b)
 static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { return ((int32_t *)&v)[index]; }
 static FORCEINLINE void __insert_element(__vec16_i32 *v, int index, int32_t val) { ((int32_t *)v)[index] = val; }

-static FORCEINLINE __vec16_i32 __smear_i32(__vec16_i32, int32_t i) {
+static FORCEINLINE __vec16_i32 __smear_i32(int32_t i) {
     return _mm512_set_1to16_epi32(i);
 }

+static FORCEINLINE __vec16_i32 __setzero_i32() {
+    return _mm512_setzero_epi32();
+}
+
+static FORCEINLINE __vec16_i32 __undef_i32() {
+    return _mm512_undefined_epi32();
+}
+
 static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) {
     int32_t val = __extract_element(v, index & 0xf);
     return _mm512_set_1to16_epi32(val);
@@ -966,10 +982,18 @@ static FORCEINLINE void __insert_element(__vec16_f *v, int index, float val) {
     ((float *)v)[index] = val;
 }

-static FORCEINLINE __vec16_f __smear_float(__vec16_f, float f) {
+static FORCEINLINE __vec16_f __smear_float(float f) {
     return _mm512_set_1to16_ps(f);
 }

+static FORCEINLINE __vec16_f __setzero_float() {
+    return _mm512_setzero_ps();
+}
+
+static FORCEINLINE __vec16_f __undef_float() {
+    return _mm512_undefined_ps();
+}
+
 static FORCEINLINE __vec16_f __broadcast_float(__vec16_f v, int index) {
     int32_t val = __extract_element(v, index & 0xf);
     return _mm512_set_1to16_ps(val);
@@ -1116,13 +1140,27 @@ static FORCEINLINE void __insert_element(__vec16_d *v, int index, double val) {
     ((double *)v)[index] = val;
 }

-static FORCEINLINE __vec16_d __smear_double(__vec16_d, double d) {
+static FORCEINLINE __vec16_d __smear_double(double d) {
     __vec16_d ret;
     ret.v1 = _mm512_extload_pd(&d, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8,
                                _MM_HINT_NONE);
     ret.v2 = ret.v1;
     return ret;
 }

+static FORCEINLINE __vec16_d __setzero_double() {
+    __vec16_d ret;
+    ret.v1 = _mm512_setzero_pd();
+    ret.v2 = ret.v1;
+    return ret;
+}
+
+static FORCEINLINE __vec16_d __undef_double() {
+    __vec16_d ret;
+    ret.v1 = _mm512_undefined_pd();
+    ret.v2 = ret.v1;
+    return ret;
+}
+
 static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) {
     __vec16_d ret;
     int32_t val = __extract_element(v, index & 0xf);
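knc.h can map the new entry points directly onto native intrinsics (_mm512_setzero_ps, _mm512_undefined_ps, and so on), which is the payoff of threading the undef information through: the hardware value can be materialized with no instruction at all. Targets without undefined-value intrinsics, like the SSE4 header below, fall back to default construction. A hypothetical 8-wide AVX header (not part of this patch) could follow the same pattern:

    // Hypothetical AVX analogue; __vec8_f is assumed to wrap an __m256.
    static FORCEINLINE __vec8_f __setzero_float() {
        return _mm256_setzero_ps();
    }
    static FORCEINLINE __vec8_f __undef_float() {
        return _mm256_undefined_ps();
    }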
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 4fb946c3..7585d599 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -297,10 +297,18 @@ template <int ALIGN> static FORCEINLINE void __store(__vec4_i1 *p, __vec4_i1 value) {
     _mm_storeu_ps((float *)(&p->v), value.v);
 }

-static FORCEINLINE __vec4_i1 __smear_i1(__vec4_i1, int v) {
+static FORCEINLINE __vec4_i1 __smear_i1(int v) {
     return __vec4_i1(v, v, v, v);
 }

+static FORCEINLINE __vec4_i1 __setzero_i1() {
+    return __vec4_i1(_mm_setzero_ps());
+}
+
+static FORCEINLINE __vec4_i1 __undef_i1() {
+    return __vec4_i1();
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // int8
@@ -525,10 +533,18 @@ static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) {
     ((int8_t *)v)[index] = val;
 }

-static FORCEINLINE __vec4_i8 __smear_i8(__vec4_i8, int8_t v) {
+static FORCEINLINE __vec4_i8 __smear_i8(int8_t v) {
     return _mm_set1_epi8(v);
 }

+static FORCEINLINE __vec4_i8 __setzero_i8() {
+    return _mm_set1_epi8(0);
+}
+
+static FORCEINLINE __vec4_i8 __undef_i8() {
+    return __vec4_i8();
+}
+
 static FORCEINLINE __vec4_i8 __broadcast_i8(__vec4_i8 v, int index) {
     return _mm_set1_epi8(__extract_element(v, index));
 }
@@ -784,10 +800,18 @@ static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val)
     ((int16_t *)v)[index] = val;
 }

-static FORCEINLINE __vec4_i16 __smear_i16(__vec4_i16, int16_t v) {
+static FORCEINLINE __vec4_i16 __smear_i16(int16_t v) {
     return _mm_set1_epi16(v);
 }

+static FORCEINLINE __vec4_i16 __setzero_i16() {
+    return _mm_set1_epi16(0);
+}
+
+static FORCEINLINE __vec4_i16 __undef_i16() {
+    return __vec4_i16();
+}
+
 static FORCEINLINE __vec4_i16 __broadcast_i16(__vec4_i16 v, int index) {
     return _mm_set1_epi16(__extract_element(v, index));
 }
@@ -1021,10 +1045,18 @@ static FORCEINLINE __vec4_i32 __select(__vec4_i1 mask, __vec4_i32 a, __vec4_i32 b) {
                                          _mm_castsi128_ps(a.v), mask.v));
 }

-static FORCEINLINE __vec4_i32 __smear_i32(__vec4_i32, int32_t v) {
+static FORCEINLINE __vec4_i32 __smear_i32(int32_t v) {
     return _mm_set1_epi32(v);
 }

+static FORCEINLINE __vec4_i32 __setzero_i32() {
+    return _mm_castps_si128(_mm_setzero_ps());
+}
+
+static FORCEINLINE __vec4_i32 __undef_i32() {
+    return __vec4_i32();
+}
+
 static FORCEINLINE int32_t __extract_element(__vec4_i32 v, int index) {
     return ((int32_t *)&v)[index];
 }
@@ -1282,10 +1314,18 @@ static FORCEINLINE __vec4_i64 __select(__vec4_i1 mask, __vec4_i64 a, __vec4_i64 b) {
     return __vec4_i64(_mm_castpd_si128(r0), _mm_castpd_si128(r1));
 }

-static FORCEINLINE __vec4_i64 __smear_i64(__vec4_i64, int64_t v) {
+static FORCEINLINE __vec4_i64 __smear_i64(int64_t v) {
     return __vec4_i64(v, v, v, v);
 }

+static FORCEINLINE __vec4_i64 __setzero_i64() {
+    return __vec4_i64(0, 0, 0, 0);
+}
+
+static FORCEINLINE __vec4_i64 __undef_i64() {
+    return __vec4_i64();
+}
+
 static FORCEINLINE int64_t __extract_element(__vec4_i64 v, int index) {
     return ((int64_t *)&v)[index];
 }
@@ -1386,10 +1426,18 @@ static FORCEINLINE __vec4_f __select(__vec4_i1 mask, __vec4_f a, __vec4_f b) {
     return _mm_blendv_ps(b.v, a.v, mask.v);
 }

-static FORCEINLINE __vec4_f __smear_float(__vec4_f, float v) {
+static FORCEINLINE __vec4_f __smear_float(float v) {
     return _mm_set1_ps(v);
 }

+static FORCEINLINE __vec4_f __setzero_float() {
+    return _mm_setzero_ps();
+}
+
+static FORCEINLINE __vec4_f __undef_float() {
+    return __vec4_f();
+}
+
 static FORCEINLINE float __extract_element(__vec4_f v, int index) {
     return ((float *)&v)[index];
 }
@@ -1518,10 +1566,18 @@ static FORCEINLINE __vec4_d __select(__vec4_i1 mask, __vec4_d a, __vec4_d b) {
     return __vec4_d(r0, r1);
 }

-static FORCEINLINE __vec4_d __smear_double(__vec4_d, double v) {
+static FORCEINLINE __vec4_d __smear_double(double v) {
     return __vec4_d(_mm_set1_pd(v), _mm_set1_pd(v));
 }

+static FORCEINLINE __vec4_d __setzero_double() {
+    return __vec4_d(_mm_setzero_pd(), _mm_setzero_pd());
+}
+
+static FORCEINLINE __vec4_d __undef_double() {
+    return __vec4_d();
+}
+
 static FORCEINLINE double __extract_element(__vec4_d v, int index) {
     return ((double *)&v)[index];
 }
@@ -1618,13 +1674,11 @@ static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i8 val) {
 }

 static FORCEINLINE __vec4_i8 __cast_sext(__vec4_i8, __vec4_i1 v) {
-    return __select(v, __smear_i8(__vec4_i8(), 0xff),
-                    __smear_i8(__vec4_i8(), 0));
+    return __select(v, __smear_i8(0xff), __setzero_i8());
 }

 static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i1 v) {
-    return __select(v, __smear_i16(__vec4_i16(), 0xffff),
-                    __smear_i16(__vec4_i16(), 0));
+    return __select(v, __smear_i16(0xffff), __setzero_i16());
 }

 static FORCEINLINE __vec4_i32 __cast_sext(__vec4_i32, __vec4_i1 v) {
@@ -1684,12 +1738,11 @@ static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i8 val) {
 }

 static FORCEINLINE __vec4_i8 __cast_zext(__vec4_i8, __vec4_i1 v) {
-    return __select(v, __smear_i8(__vec4_i8(), 1), __smear_i8(__vec4_i8(), 0));
+    return __select(v, __smear_i8(1), __setzero_i8());
 }

 static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i1 v) {
-    return __select(v, __smear_i16(__vec4_i16(), 1),
-                    __smear_i16(__vec4_i16(), 0));
+    return __select(v, __smear_i16(1), __setzero_i16());
 }

 static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
@@ -1697,7 +1750,7 @@ static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
 }

 static FORCEINLINE __vec4_i64 __cast_zext(__vec4_i64, __vec4_i1 v) {
-    return __select(v, __smear_i64(__vec4_i64(), 1), __smear_i64(__vec4_i64(), 0));
+    return __select(v, __smear_i64(1), __setzero_i64());
 }

 // truncations
@@ -1857,11 +1910,11 @@ static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i64 val) {
 }

 static FORCEINLINE __vec4_f __cast_uitofp(__vec4_f, __vec4_i1 v) {
-    return __select(v, __smear_float(__vec4_f(), 1.), __smear_float(__vec4_f(), 0.));
+    return __select(v, __smear_float(1.), __setzero_float());
 }

 static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i1 v) {
-    return __select(v, __smear_double(__vec4_d(), 1.), __smear_double(__vec4_d(), 0.));
+    return __select(v, __smear_double(1.), __setzero_double());
 }

 // float/double to signed int
@@ -2796,8 +2849,8 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
     RetScalar r[4];
 #if 1
     // "Fast gather" trick...
-    offsets = __select(mask, offsets, __smear_i32(__vec4_i32(), 0));
-    constOffset = __select(mask, constOffset, __smear_i32(__vec4_i32(), 0));
+    offsets = __select(mask, offsets, __setzero_i32());
+    constOffset = __select(mask, constOffset, __setzero_i32());

     int offset = scale * _mm_extract_epi32(offsets.v, 0) +
         _mm_extract_epi32(constOffset.v, 0);
     RetScalar *ptr = (RetScalar *)(p + offset);
@@ -2854,8 +2907,8 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
     RetScalar r[4];
 #if 1
     // "Fast gather" trick...
-    offsets = __select(mask, offsets, __smear_i64(__vec4_i64(), 0));
-    constOffset = __select(mask, constOffset, __smear_i64(__vec4_i64(), 0));
+    offsets = __select(mask, offsets, __setzero_i64());
+    constOffset = __select(mask, constOffset, __setzero_i64());

     int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +
         _mm_extract_epi64(constOffset.v[0], 0);
     RetScalar *ptr = (RetScalar *)(p + offset);
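Why replacing __smear_*(..., 0) with __setzero_* is safe in the "fast gather" path: the trick speculatively loads from every lane, so inactive lanes have their offsets forced to zero first, which makes them read from the base pointer itself, a valid address, rather than whatever garbage their offset lane happens to hold; the values loaded for those lanes are then discarded. A simplified sketch of the idea (hypothetical helper, condensed from lGatherBaseOffsets32 above; scale and constOffset omitted):

    static FORCEINLINE __vec4_i32 __gather32_sketch(unsigned char *p,
                                                    __vec4_i32 offsets,
                                                    __vec4_i1 mask) {
        // Force inactive lanes to offset 0 so they load *(int32_t *)p,
        // a valid (if meaningless) address.
        offsets = __select(mask, offsets, __setzero_i32());
        int32_t r[4];
        for (int i = 0; i < 4; ++i)
            r[i] = *(int32_t *)(p + __extract_element(offsets, i));
        // Lanes that were masked off hold junk here; the caller blends
        // them away against the old destination value.
        return __vec4_i32(r[0], r[1], r[2], r[3]);
    }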