diff --git a/cbackend.cpp b/cbackend.cpp
index 1505dae6..9f6eb7eb 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -523,7 +523,7 @@ static std::string CBEMangle(const std::string &S) {
   std::string Result;
 
   for (unsigned i = 0, e = S.size(); i != e; ++i)
-    if (isalnum(S[i]) || S[i] == '_') {
+    if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') {
       Result += S[i];
     } else {
       Result += '_';
@@ -1115,29 +1115,35 @@ bool CWriter::printCast(unsigned opc, llvm::Type *SrcTy, llvm::Type *DstTy) {
 
 /** Construct the name of a function with the given base and returning a
-    vector of a given type. For example, if base is "foo" and matchType is
-    i16, this will return the string "__foo_i16".
+    vector of a given type, of the specified width. For example, if base
+    is "foo" and matchType is i32 and width is 16, this will return the
+    string "__foo_i32<__vec16_i32>".
 */
 static const char *
-lGetTypedFunc(const char *base, llvm::Type *matchType) {
-    char buf[64];
-    sprintf(buf, "__%s_", base);
+lGetTypedFunc(const char *base, llvm::Type *matchType, int width) {
+    static const char *ty_desc_str[] = {"f", "d", "i1", "i8", "i16", "i32", "i64"};
+    static const char *fn_desc_str[] = {"float", "double", "i1", "i8", "i16", "i32", "i64"};
+    enum {DESC_FLOAT, DESC_DOUBLE, DESC_I1, DESC_I8, DESC_I16, DESC_I32, DESC_I64} desc;
+
     switch (matchType->getTypeID()) {
-    case llvm::Type::FloatTyID:  strcat(buf, "float");  break;
-    case llvm::Type::DoubleTyID: strcat(buf, "double"); break;
+    case llvm::Type::FloatTyID:  desc = DESC_FLOAT;  break;
+    case llvm::Type::DoubleTyID: desc = DESC_DOUBLE; break;
     case llvm::Type::IntegerTyID: {
         switch (llvm::cast<llvm::IntegerType>(matchType)->getBitWidth()) {
-        case 1:  strcat(buf, "i1");  break;
-        case 8:  strcat(buf, "i8");  break;
-        case 16: strcat(buf, "i16"); break;
-        case 32: strcat(buf, "i32"); break;
-        case 64: strcat(buf, "i64"); break;
+        case 1:  desc = DESC_I1;  break;
+        case 8:  desc = DESC_I8;  break;
+        case 16: desc = DESC_I16; break;
+        case 32: desc = DESC_I32; break;
+        case 64: desc = DESC_I64; break;
        default: return NULL;
        }
        break;
    }
    default: return NULL;
    }
+
+    char buf[64];
+    snprintf(buf, 64, "__%s_%s<__vec%d_%s>", base, fn_desc_str[desc], width, ty_desc_str[desc]);
 
     return strdup(buf);
 }
@@ -1486,19 +1492,19 @@ void CWriter::printConstant(llvm::Constant *CPV, bool Static) {
         if (llvm::isa<llvm::ConstantAggregateZero>(CPV)) {
             // All zeros; call the __setzero_* function.
-            const char *setZeroFunc = lGetTypedFunc("setzero", VT->getElementType());
+            const char *setZeroFunc = lGetTypedFunc("setzero", VT->getElementType(), vectorWidth);
             assert(setZeroFunc != NULL);
             Out << setZeroFunc << "()";
         }
         else if (llvm::isa<llvm::UndefValue>(CPV)) {
             // Undefined value; call __undef_* so that we can potentially pass
             // this information along..
-            const char *undefFunc = lGetTypedFunc("undef", VT->getElementType());
+            const char *undefFunc = lGetTypedFunc("undef", VT->getElementType(), vectorWidth);
             assert(undefFunc != NULL);
             Out << undefFunc << "()";
         }
         else {
-            const char *smearFunc = lGetTypedFunc("smear", VT->getElementType());
+            const char *smearFunc = lGetTypedFunc("smear", VT->getElementType(), vectorWidth);
 
             if (llvm::ConstantVector *CV = llvm::dyn_cast<llvm::ConstantVector>(CPV)) {
                 llvm::Constant *splatValue = CV->getSplatValue();
@@ -1713,8 +1719,8 @@ void CWriter::printConstantWithCast(llvm::Constant* CPV, unsigned Opcode) {
 
 std::string CWriter::GetValueName(const llvm::Value *Operand) {
   // Resolve potential alias.
-  if (const llvm::GlobalAlias *GA = llvm::dyn_cast<llvm::GlobalAlias>(Operand)) {
-    if (const llvm::Value *V = GA->resolveAliasedGlobal(false))
+  if (const llvm::GlobalAlias *GA = llvm::dyn_cast<llvm::GlobalAlias>(Operand)) {
+    if (const llvm::Value *V = GA->resolveAliasedGlobal(false))
       Operand = V;
   }
@@ -4362,12 +4368,11 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         {
             llvm::Type *matchType = toMatch->getType();
-            const char *smearFuncName = lGetTypedFunc("smear", matchType);
-
+            const char *smearFuncName = lGetTypedFunc("smear", matchType, vectorWidth);
             if (smearFuncName != NULL) {
                 llvm::Function *smearFunc = module->getFunction(smearFuncName);
                 if (smearFunc == NULL) {
-                    // Declare the smar function if needed; it takes a single
+                    // Declare the smear function if needed; it takes a single
                     // scalar parameter and returns a vector of the same
                     // parameter type.
                     llvm::Constant *sf =
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index 42978701..c4bff793 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -271,7 +271,8 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
 }
 
 #define SMEAR(VTYPE, NAME, STYPE) \
-static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
+template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
+template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) { \
     VTYPE ret; \
     for (int i = 0; i < 16; ++i) \
         ret.v[i] = v; \
@@ -279,7 +280,8 @@ static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
 }
 
 #define SETZERO(VTYPE, NAME) \
-static FORCEINLINE VTYPE __setzero_##NAME() { \
+template <class RetVecType> VTYPE __setzero_##NAME(); \
+template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() { \
     VTYPE ret; \
     for (int i = 0; i < 16; ++i) \
         ret.v[i] = 0; \
@@ -287,7 +289,8 @@ static FORCEINLINE VTYPE __setzero_##NAME() { \
 }
 
 #define UNDEF(VTYPE, NAME) \
-static FORCEINLINE VTYPE __undef_##NAME() { \
+template <class RetVecType> VTYPE __undef_##NAME(); \
+template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() { \
     return VTYPE(); \
 }
 
@@ -416,18 +419,20 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
     *ptr = v.v;
 }
 
-static FORCEINLINE __vec16_i1 __smear_i1(int v) {
+template <class RetVecType> __vec16_i1 __smear_i1(int i);
+template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int v) {
     return __vec16_i1(v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v);
 }
 
-static FORCEINLINE __vec16_i1 __setzero_i1() {
+template <class RetVecType> __vec16_i1 __setzero_i1();
+template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
     return __vec16_i1(0, 0, 0, 0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-
-static FORCEINLINE __vec16_i1 __undef_i1() {
+template <class RetVecType> __vec16_i1 __undef_i1();
+template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
     return __vec16_i1();
 }
 
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index 94946f4a..ffd5b478 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -336,15 +336,17 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
 }
 
 #define SMEAR(VTYPE, NAME, STYPE) \
-static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
+template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
+template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) { \
     VTYPE ret; \
     for (int i = 0; i < 32; ++i) \
         ret.v[i] = v; \
     return ret; \
 }
 
-#define SETZERO(VTYPE, NAME) \
-static FORCEINLINE VTYPE __setzero_##NAME() { \
+#define SETZERO(VTYPE, NAME) \
+template <class RetVecType> VTYPE __setzero_##NAME(); \
+template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() { \
     VTYPE ret; \
     for (int i = 0; i < 32; ++i) \
         ret.v[i] = 0; \
@@ -352,7 +354,8 @@ static FORCEINLINE VTYPE __setzero_##NAME() { \
 }
 
 #define UNDEF(VTYPE, NAME) \
-static FORCEINLINE VTYPE __undef_##NAME() { \
+template <class RetVecType> VTYPE __undef_##NAME(); \
+template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() { \
     return VTYPE(); \
 }
 
@@ -481,21 +484,24 @@ template <int ALIGN> static FORCEINLINE void __store(__vec32_i1 *p, __vec32_i1 v
     *ptr = v.v;
 }
 
-static FORCEINLINE __vec32_i1 __smear_i1(int v) {
+template <class RetVecType> __vec32_i1 __smear_i1(int i);
+template <> FORCEINLINE __vec32_i1 __smear_i1<__vec32_i1>(int v) {
     return __vec32_i1(v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v);
 }
 
-static FORCEINLINE __vec32_i1 __setzero_i1() {
+template <class RetVecType> __vec32_i1 __setzero_i1();
+template <> FORCEINLINE __vec32_i1 __setzero_i1<__vec32_i1>() {
     return __vec32_i1(0, 0, 0, 0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static FORCEINLINE __vec32_i1 __undef_i1() {
+template <class RetVecType> __vec32_i1 __undef_i1();
+template <> FORCEINLINE __vec32_i1 __undef_i1<__vec32_i1>() {
     return __vec32_i1();
 }
 
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index ff84fee3..a33e1d15 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -461,7 +461,8 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \
 }
 
 #define SMEAR(VTYPE, NAME, STYPE) \
-static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
+template <class RetVecType> VTYPE __smear_##NAME(STYPE); \
+template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) { \
     VTYPE ret; \
     for (int i = 0; i < 64; ++i) \
         ret.v[i] = v; \
@@ -469,7 +470,8 @@ static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
 }
 
 #define SETZERO(VTYPE, NAME) \
-static FORCEINLINE VTYPE __setzero_##NAME() { \
+template <class RetVecType> VTYPE __setzero_##NAME(); \
+template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() { \
     VTYPE ret; \
     for (int i = 0; i < 64; ++i) \
         ret.v[i] = 0; \
@@ -477,7 +479,8 @@ static FORCEINLINE VTYPE __setzero_##NAME() { \
 }
 
 #define UNDEF(VTYPE, NAME) \
-static FORCEINLINE VTYPE __undef_##NAME(VTYPE retType) { \
+template <class RetVecType> VTYPE __undef_##NAME(); \
+template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() { \
     return VTYPE(); \
 }
 
@@ -606,7 +609,8 @@ template <int ALIGN> static FORCEINLINE void __store(__vec64_i1 *p, __vec64_i1 v
     *ptr = v.v;
 }
 
-static FORCEINLINE __vec64_i1 __smear_i1(int v) {
+template <class RetVecType> __vec64_i1 __smear_i1(int i);
+template <> FORCEINLINE __vec64_i1 __smear_i1<__vec64_i1>(int v) {
     return __vec64_i1(v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
@@ -617,7 +621,8 @@ static FORCEINLINE __vec64_i1 __smear_i1(int v) {
                       v, v, v, v, v, v, v, v);
 }
 
-static FORCEINLINE __vec64_i1 __setzero_i1() {
+template <class RetVecType> __vec64_i1 __setzero_i1();
+template <> FORCEINLINE __vec64_i1 __setzero_i1<__vec64_i1>() {
     return __vec64_i1(0, 0, 0, 0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0,
@@ -628,7 +633,8 @@ static FORCEINLINE __vec64_i1 __setzero_i1() {
                       0, 0, 0, 0, 0, 0, 0, 0);
 }
 
-static FORCEINLINE __vec64_i1 __undef_i1() {
+template <class RetVecType> __vec64_i1 __undef_i1();
+template <> FORCEINLINE __vec64_i1 __undef_i1<__vec64_i1>() {
     return __vec64_i1();
 }
 
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 404cd24f..6abffe74 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -477,15 +477,18 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
     *ptr = v.m;
 }
 
-static FORCEINLINE __vec16_i1 __smear_i1(int i) {
+template <class RetVecType> __vec16_i1 __smear_i1(int i);
+template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) {
     return i?0xFFFF:0x0;
 }
 
-static FORCEINLINE __vec16_i1 __setzero_i1() {
+template <class RetVecType> __vec16_i1 __setzero_i1();
+template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
     return 0;
 }
 
-static FORCEINLINE __vec16_i1 __undef_i1() {
+template <class RetVecType> __vec16_i1 __undef_i1();
+template <> static FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
     return __vec16_i1(); // FIXME? __mm512_undef_mask();
 }
 
@@ -744,15 +747,18 @@ static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b)
 
 static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { return ((int32_t *)&v)[index]; }
 static FORCEINLINE void __insert_element(__vec16_i32 *v, int index, int32_t val) { ((int32_t *)v)[index] = val; }
 
-static FORCEINLINE __vec16_i32 __smear_i32(int32_t i) {
+template <class RetVecType> __vec16_i32 __smear_i32(int32_t i);
+template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) {
     return _mm512_set_1to16_epi32(i);
 }
 
-static FORCEINLINE __vec16_i32 __setzero_i32() {
+template <class RetVecType> __vec16_i32 __setzero_i32();
+template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() {
     return _mm512_setzero_epi32();
 }
 
-static FORCEINLINE __vec16_i32 __undef_i32() {
+template <class RetVecType> __vec16_i32 __undef_i32();
+template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() {
     return _mm512_undefined_epi32();
 }
 
@@ -803,15 +809,24 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
 
 // int64
 
-static FORCEINLINE __vec16_i64 __setzero_i64() {
+template <class RetVecType> __vec16_i64 __setzero_i64();
+template <> static FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() {
     __vec16_i64 ret;
     ret.v_lo = _mm512_setzero_epi32();
     ret.v_hi = _mm512_setzero_epi32();
     return ret;
 }
 
+template <class RetVecType> __vec16_i64 __undef_i64();
+template <> static FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() {
+    __vec16_i64 ret;
+    ret.v_lo = _mm512_undefined_epi32();
+    ret.v_hi = _mm512_undefined_epi32();
+    return ret;
+}
+
 static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
-{ 
+{
     __mmask16 carry = 0;
     __m512i lo = _mm512_addsetc_epi32(a.v_lo, b.v_lo, &carry);
     __m512i hi = _mm512_adc_epi32(a.v_hi, carry, b.v_hi, &carry);
@@ -885,7 +900,8 @@ static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
     return src[index+16] | (int64_t(src[index]) << 32);
 }
 
-static FORCEINLINE __vec16_i64 __smear_i64(const int64_t &l) {
+template <class RetVecType> __vec16_i64 __smear_i64(const int64_t &l);
+template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) {
     const int *i = (const int*)&l;
     return __vec16_i64(_mm512_set_1to16_epi32(i[0]),
                        _mm512_set_1to16_epi32(i[1]));
}
@@ -897,12 +913,12 @@ LOAD_STORE(__vec16_i64, int64_t)
 
 template <int ALIGN> static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) {
-    __m512i v1;
-    __m512i v2;
-    v2 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v2 = _mm512_extloadunpacklo_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v1 = _mm512_extloadunpackhi_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v1 = _mm512_extloadunpacklo_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    __vec16_i32 v1 = _mm512_undefined_epi32();
+    __vec16_i32 v2 = _mm512_undefined_epi32();
+    v2 = _mm512_extloadunpackhi_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v1 = _mm512_extloadunpackhi_epi32(v1, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v1 = _mm512_extloadunpacklo_epi32(v1, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     __vec16_i64 ret;
     ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
@@ -1078,15 +1094,18 @@ static FORCEINLINE void __insert_element(__vec16_f *v, int index, float val) {
     ((float *)v)[index] = val;
 }
 
-static FORCEINLINE __vec16_f __smear_float(float f) {
+template <class RetVecType> __vec16_f __smear_float(float f);
+template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) {
     return _mm512_set_1to16_ps(f);
 }
 
-static FORCEINLINE __vec16_f __setzero_float() {
+template <class RetVecType> __vec16_f __setzero_float();
+template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() {
     return _mm512_setzero_ps();
 }
 
-static FORCEINLINE __vec16_f __undef_float() {
+template <class RetVecType> __vec16_f __undef_float();
+template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() {
     return _mm512_undefined_ps();
 }
 
@@ -1287,21 +1306,24 @@ static FORCEINLINE void __insert_element(__vec16_d *v, int index, double val) {
     ((double *)v)[index] = val;
 }
 
-static FORCEINLINE __vec16_d __smear_double(double d) {
+template <class RetVecType> __vec16_d __smear_double(double d);
+template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) {
     __vec16_d ret;
     ret.v1 = _mm512_extload_pd(&d, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
     ret.v2 = ret.v1;
     return ret;
 }
 
-static FORCEINLINE __vec16_d __setzero_double() {
+template <class RetVecType> __vec16_d __setzero_double();
+template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() {
     __vec16_d ret;
     ret.v1 = _mm512_setzero_pd();
     ret.v2 = ret.v1;
     return ret;
 }
 
-static FORCEINLINE __vec16_d __undef_double() {
+template <class RetVecType> __vec16_d __undef_double();
+template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() {
     __vec16_d ret;
     ret.v1 = _mm512_undefined_pd();
     ret.v2 = ret.v1;
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 17ab8f18..b894cb29 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -322,15 +322,18 @@ template <int ALIGN> static FORCEINLINE void __store(__vec4_i1 *p, __vec4_i1 val
     _mm_storeu_ps((float *)(&p->v), value.v);
 }
 
-static FORCEINLINE __vec4_i1 __smear_i1(int v) {
+template <class RetVecType> __vec4_i1 __smear_i1(int v);
+template <> FORCEINLINE __vec4_i1 __smear_i1<__vec4_i1>(int v) {
     return __vec4_i1(v, v, v, v);
 }
 
-static FORCEINLINE __vec4_i1 __setzero_i1() {
+template <class RetVecType> __vec4_i1 __setzero_i1();
+template <> FORCEINLINE __vec4_i1 __setzero_i1<__vec4_i1>() {
     return __vec4_i1(_mm_setzero_ps());
 }
 
-static FORCEINLINE __vec4_i1 __undef_i1() {
+template <class RetVecType> __vec4_i1 __undef_i1();
+template <> FORCEINLINE __vec4_i1 __undef_i1<__vec4_i1>() {
     return __vec4_i1();
 }
 
@@ -560,15 +563,18 @@ static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) {
     ((int8_t *)v)[index] = val;
 }
 
-static FORCEINLINE __vec4_i8 __smear_i8(int8_t v) {
+template <class RetVecType> __vec4_i8 __smear_i8(int8_t v);
+template <> FORCEINLINE __vec4_i8 __smear_i8<__vec4_i8>(int8_t v) {
    return _mm_set1_epi8(v);
 }
 
-static FORCEINLINE __vec4_i8 __setzero_i8() {
+template <class RetVecType> __vec4_i8 __setzero_i8();
+template <> FORCEINLINE __vec4_i8 __setzero_i8<__vec4_i8>() {
    return _mm_set1_epi8(0);
 }
 
-static FORCEINLINE __vec4_i8 __undef_i8() {
+template <class RetVecType> __vec4_i8 __undef_i8();
+template <> FORCEINLINE __vec4_i8 __undef_i8<__vec4_i8>() {
    return __vec4_i8();
 }
 
@@ -829,15 +835,18 @@ static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val)
     ((int16_t *)v)[index] = val;
 }
 
-static FORCEINLINE __vec4_i16 __smear_i16(int16_t v) {
+template <class RetVecType> __vec4_i16 __smear_i16(int16_t v);
+template <> FORCEINLINE __vec4_i16 __smear_i16<__vec4_i16>(int16_t v) {
    return _mm_set1_epi16(v);
 }
 
-static FORCEINLINE __vec4_i16 __setzero_i16() {
+template <class RetVecType> __vec4_i16 __setzero_i16();
+template <> FORCEINLINE __vec4_i16 __setzero_i16<__vec4_i16>() {
    return _mm_set1_epi16(0);
 }
 
-static FORCEINLINE __vec4_i16 __undef_i16() {
+template <class RetVecType> __vec4_i16 __undef_i16();
+template <> FORCEINLINE __vec4_i16 __undef_i16<__vec4_i16>() {
    return __vec4_i16();
 }
 
@@ -1076,15 +1085,18 @@ static FORCEINLINE __vec4_i32 __select(__vec4_i1 mask, __vec4_i32 a, __vec4_i32
                                           _mm_castsi128_ps(a.v), mask.v));
 }
 
-static FORCEINLINE __vec4_i32 __smear_i32(int32_t v) {
+template <class RetVecType> __vec4_i32 __smear_i32(int32_t v);
+template <> FORCEINLINE __vec4_i32 __smear_i32<__vec4_i32>(int32_t v) {
    return _mm_set1_epi32(v);
 }
 
-static FORCEINLINE __vec4_i32 __setzero_i32() {
+template <class RetVecType> __vec4_i32 __setzero_i32();
+template <> FORCEINLINE __vec4_i32 __setzero_i32<__vec4_i32>() {
    return _mm_castps_si128(_mm_setzero_ps());
 }
 
-static FORCEINLINE __vec4_i32 __undef_i32() {
+template <class RetVecType> __vec4_i32 __undef_i32();
+template <> FORCEINLINE __vec4_i32 __undef_i32<__vec4_i32>() {
    return __vec4_i32();
 }
 
@@ -1347,15 +1359,18 @@ static FORCEINLINE __vec4_i64 __select(__vec4_i1 mask, __vec4_i64 a, __vec4_i64
     return __vec4_i64(_mm_castpd_si128(r0), _mm_castpd_si128(r1));
 }
 
-static FORCEINLINE __vec4_i64 __smear_i64(int64_t v) {
+template <class RetVecType> __vec4_i64 __smear_i64(int64_t v);
+template <> FORCEINLINE __vec4_i64 __smear_i64<__vec4_i64>(int64_t v) {
     return __vec4_i64(v, v, v, v);
 }
 
-static FORCEINLINE __vec4_i64 __setzero_i64() {
+template <class RetVecType> __vec4_i64 __setzero_i64();
+template <> FORCEINLINE __vec4_i64 __setzero_i64<__vec4_i64>() {
     return __vec4_i64(0, 0, 0, 0);
 }
 
-static FORCEINLINE __vec4_i64 __undef_i64() {
+template <class RetVecType> __vec4_i64 __undef_i64();
+template <> FORCEINLINE __vec4_i64 __undef_i64<__vec4_i64>() {
     return __vec4_i64();
 }
 
@@ -1465,15 +1480,18 @@ static FORCEINLINE __vec4_f __select(__vec4_i1 mask, __vec4_f a, __vec4_f b) {
     return _mm_blendv_ps(b.v, a.v, mask.v);
 }
 
-static FORCEINLINE __vec4_f __smear_float(float v) {
+template <class RetVecType> __vec4_f __smear_float(float v);
+template <> FORCEINLINE __vec4_f __smear_float<__vec4_f>(float v) {
     return _mm_set1_ps(v);
 }
 
-static FORCEINLINE __vec4_f __setzero_float() {
+template <class RetVecType> __vec4_f __setzero_float();
+template <> FORCEINLINE __vec4_f __setzero_float<__vec4_f>() {
     return _mm_setzero_ps();
 }
 
-static FORCEINLINE __vec4_f __undef_float() {
+template <class RetVecType> __vec4_f __undef_float();
+template <> FORCEINLINE __vec4_f __undef_float<__vec4_f>() {
     return __vec4_f();
 }
 
@@ -1614,15 +1632,18 @@ static FORCEINLINE __vec4_d __select(__vec4_i1 mask, __vec4_d a, __vec4_d b) {
     return __vec4_d(r0, r1);
 }
 
-static FORCEINLINE __vec4_d __smear_double(double v) {
+template <class RetVecType> __vec4_d __smear_double(double v);
+template <> FORCEINLINE __vec4_d __smear_double<__vec4_d>(double v) {
     return __vec4_d(_mm_set1_pd(v), _mm_set1_pd(v));
 }
 
-static FORCEINLINE __vec4_d __setzero_double() {
+template <class RetVecType> __vec4_d __setzero_double();
+template <> FORCEINLINE __vec4_d __setzero_double<__vec4_d>() {
     return __vec4_d(_mm_setzero_pd(), _mm_setzero_pd());
 }
 
-static FORCEINLINE __vec4_d __undef_double() {
+template <class RetVecType> __vec4_d __undef_double();
+template <> FORCEINLINE __vec4_d __undef_double<__vec4_d>() {
     return __vec4_d();
 }
 
@@ -1722,11 +1743,11 @@ static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i8 val) {
 }
 
 static FORCEINLINE __vec4_i8 __cast_sext(__vec4_i8, __vec4_i1 v) {
-    return __select(v, __smear_i8(0xff), __setzero_i8());
+    return __select(v, __smear_i8<__vec4_i8>(0xff), __setzero_i8<__vec4_i8>());
 }
 
 static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i1 v) {
-    return __select(v, __smear_i16(0xffff), __setzero_i16());
+    return __select(v, __smear_i16<__vec4_i16>(0xffff), __setzero_i16<__vec4_i16>());
 }
 
 static FORCEINLINE __vec4_i32 __cast_sext(__vec4_i32, __vec4_i1 v) {
@@ -1786,11 +1807,11 @@ static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i8 val) {
 }
 
 static FORCEINLINE __vec4_i8 __cast_zext(__vec4_i8, __vec4_i1 v) {
-    return __select(v, __smear_i8(1), __setzero_i8());
+    return __select(v, __smear_i8<__vec4_i8>(1), __setzero_i8<__vec4_i8>());
 }
 
 static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i1 v) {
-    return __select(v, __smear_i16(1), __setzero_i16());
+    return __select(v, __smear_i16<__vec4_i16>(1), __setzero_i16<__vec4_i16>());
 }
 
 static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
@@ -1798,7 +1819,7 @@ static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
 }
 
 static FORCEINLINE __vec4_i64 __cast_zext(__vec4_i64, __vec4_i1 v) {
-    return __select(v, __smear_i64(1), __setzero_i64());
+    return __select(v, __smear_i64<__vec4_i64>(1), __setzero_i64<__vec4_i64>());
 }
 
 // truncations
@@ -1958,11 +1979,11 @@ static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i64 val) {
 }
 
 static FORCEINLINE __vec4_f __cast_uitofp(__vec4_f, __vec4_i1 v) {
-    return __select(v, __smear_float(1.), __setzero_float());
+    return __select(v, __smear_float<__vec4_f>(1.), __setzero_float<__vec4_f>());
 }
 
 static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i1 v) {
-    return __select(v, __smear_double(1.), __setzero_double());
+    return __select(v, __smear_double<__vec4_d>(1.), __setzero_double<__vec4_d>());
 }
 
 // float/double to signed int
@@ -2897,7 +2918,7 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, uint32_t scale,
     RetScalar r[4];
 #if 1
     // "Fast gather" trick...
-    offsets = __select(mask, offsets, __setzero_i32());
+    offsets = __select(mask, offsets, __setzero_i32<__vec4_i32>());
 
     int offset = scale * _mm_extract_epi32(offsets.v, 0);
     RetScalar *ptr = (RetScalar *)(p + offset);
@@ -2954,7 +2975,7 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, uint32_t scale,
     RetScalar r[4];
 #if 1
     // "Fast gather" trick...
-    offsets = __select(mask, offsets, __setzero_i64());
+    offsets = __select(mask, offsets, __setzero_i64<__vec4_i64>());
 
     int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
     RetScalar *ptr = (RetScalar *)(p + offset);
diff --git a/stmt.cpp b/stmt.cpp
index 04807faf..4aaac257 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -2207,7 +2207,7 @@ ForeachUniqueStmt::EmitCode(FunctionEmitContext *ctx) const {
     // lane's value of the varying expression is the same as the value
     // we've selected to process this time through--i.e.:
     // oldMask & (smear(element) == exprValue)
-    llvm::Value *uniqueSmear = ctx->SmearUniform(uniqueValue, "unique_semar");
+    llvm::Value *uniqueSmear = ctx->SmearUniform(uniqueValue, "unique_smear");
     llvm::Value *matchingLanes = NULL;
     if (uniqueValue->getType()->isFloatingPointTy())
         matchingLanes =
diff --git a/type.cpp b/type.cpp
index 272de7a4..942ff45c 100644
--- a/type.cpp
+++ b/type.cpp
@@ -1060,12 +1060,12 @@ PointerType::Mangle() const {
         return "";
     }
 
-    std::string ret = variability.MangleString() + std::string("<");
+    std::string ret = variability.MangleString() + std::string("_3C_"); // <
     if (isSlice || isFrozen) ret += "-";
     if (isSlice) ret += "s";
    if (isFrozen) ret += "f";
    if (isSlice || isFrozen) ret += "-";
-    return ret + baseType->Mangle() + std::string(">");
+    return ret + baseType->Mangle() + std::string("_3E_"); // >
 }
 
@@ -1636,7 +1636,7 @@ std::string
 VectorType::Mangle() const {
     std::string s = base->Mangle();
     char buf[16];
-    sprintf(buf, "<%d>", numElements);
+    sprintf(buf, "_3C_%d_3E_", numElements); // "<%d>"
     return s + std::string(buf);
 }
 
@@ -1789,7 +1789,7 @@ lMangleStructName(const std::string &name, Variability variability) {
         n += buf;
         break;
     default:
-        FATAL("Unexpected varaibility in lMangleStructName()");
+        FATAL("Unexpected variability in lMangleStructName()");
     }
 
     // And stuff the name at the end....
@@ -2049,7 +2049,7 @@ std::string
 StructType::Mangle() const {
     return lMangleStruct(variability, isConst, name);
 }
- 
+
 
 std::string
 StructType::GetCDeclaration(const std::string &n) const {
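
Note on the pattern used throughout the intrinsics headers above: each former static __smear_*/__setzero_*/__undef_* helper becomes a forward-declared function template plus an explicit specialization on the requested return vector type, so the C++ emitted by the backend can pick the helper purely by naming the return type (which lGetTypedFunc now spells as, e.g., "__smear_i32<__vec16_i32>"). A minimal, self-contained sketch of the same idea follows; the plain-struct __vec4_i32 and the RetVecType parameter name here are illustrative stand-ins, not code lifted verbatim from the headers.

    // sketch.cpp -- return-type-dispatched "smear", standalone illustration
    #include <cstdio>

    struct __vec4_i32 { int v[4]; };   // stand-in for the SSE-backed type

    // Primary template: declared but never defined; only specializations exist.
    template <class RetVecType> RetVecType __smear_i32(int value);

    // Explicit specialization selected when the caller asks for __vec4_i32.
    template <> __vec4_i32 __smear_i32<__vec4_i32>(int value) {
        __vec4_i32 ret;
        for (int i = 0; i < 4; ++i)
            ret.v[i] = value;               // replicate the scalar into every lane
        return ret;
    }

    int main() {
        // The generated code names the return type explicitly, e.g.
        // "__smear_i32<__vec4_i32>(3)", matching what lGetTypedFunc emits.
        __vec4_i32 x = __smear_i32<__vec4_i32>(3);
        std::printf("%d %d %d %d\n", x.v[0], x.v[1], x.v[2], x.v[3]);
        return 0;
    }

Because only the specializations are ever defined, requesting a vector type that has no specialization fails at link time instead of silently binding to the wrong helper.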
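The cbackend.cpp and type.cpp changes exist because these template names push '<' and '>' into symbol names: CBEMangle now passes the brackets through to the emitted C++, while the Mangle() methods encode them as _3C_/_3E_ (the ASCII codes of '<' and '>') wherever the raw characters are not acceptable. A small, hypothetical helper showing the same escaping rule (not a function from the tree):

    // Hypothetical helper mirroring the "_3C_" / "_3E_" encoding used in
    // type.cpp above (0x3C and 0x3E are the ASCII codes of '<' and '>').
    #include <string>

    static std::string lEscapeAngleBrackets(const std::string &s) {
        std::string ret;
        for (char c : s) {
            if (c == '<')      ret += "_3C_";
            else if (c == '>') ret += "_3E_";
            else               ret += c;
        }
        return ret;
    }

    // Example: lEscapeAngleBrackets("v<4>") == "v_3C_4_3E_"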