From e5fe0eabdc082dc004337b9d194ef6f9224ddcd7 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Fri, 6 Jul 2012 08:47:47 -0700
Subject: [PATCH 1/2] Update __load() builtins to take const pointers.

---
 examples/intrinsics/generic-16.h |  4 ++--
 examples/intrinsics/generic-32.h |  4 ++--
 examples/intrinsics/generic-64.h |  4 ++--
 examples/intrinsics/knc.h        | 20 ++++++++++----------
 examples/intrinsics/sse4.h       | 14 +++++++-------
 5 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index 438b4d5f..e4123cc5 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -212,7 +212,7 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
 
 #define LOAD_STORE(VTYPE, STYPE)                  \
 template <int ALIGN>                              \
-static FORCEINLINE VTYPE __load(VTYPE *p) {       \
+static FORCEINLINE VTYPE __load(const VTYPE *p) { \
     STYPE *ptr = (STYPE *)p;                      \
     VTYPE ret;                                    \
     for (int i = 0; i < 16; ++i)                  \
@@ -395,7 +395,7 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
         vec->v |= (1 << index);
 }
 
-template <int ALIGN> static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p) {
+template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
     uint16_t *ptr = (uint16_t *)p;
     __vec16_i1 r;
     r.v = *ptr;
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index dc55fb00..271f4f52 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -277,7 +277,7 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
 
 #define LOAD_STORE(VTYPE, STYPE)                  \
 template <int ALIGN>                              \
-static FORCEINLINE VTYPE __load(VTYPE *p) {       \
+static FORCEINLINE VTYPE __load(const VTYPE *p) { \
     STYPE *ptr = (STYPE *)p;                      \
     VTYPE ret;                                    \
     for (int i = 0; i < 32; ++i)                  \
@@ -460,7 +460,7 @@ static FORCEINLINE void __insert_element(__vec32_i1 *vec, int index,
         vec->v |= (1 << index);
 }
 
-template <int ALIGN> static FORCEINLINE __vec32_i1 __load(__vec32_i1 *p) {
+template <int ALIGN> static FORCEINLINE __vec32_i1 __load(const __vec32_i1 *p) {
     uint16_t *ptr = (uint16_t *)p;
     __vec32_i1 r;
     r.v = *ptr;
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index f1eb22ae..d1703e38 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -402,7 +402,7 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
 
 #define LOAD_STORE(VTYPE, STYPE)                  \
 template <int ALIGN>                              \
-static FORCEINLINE VTYPE __load(VTYPE *p) {       \
+static FORCEINLINE VTYPE __load(const VTYPE *p) { \
     STYPE *ptr = (STYPE *)p;                      \
     VTYPE ret;                                    \
     for (int i = 0; i < 64; ++i)                  \
@@ -585,7 +585,7 @@ static FORCEINLINE void __insert_element(__vec64_i1 *vec, int index,
         vec->v |= (1ull << index);
 }
 
-template <int ALIGN> static FORCEINLINE __vec64_i1 __load(__vec64_i1 *p) {
+template <int ALIGN> static FORCEINLINE __vec64_i1 __load(const __vec64_i1 *p) {
     uint16_t *ptr = (uint16_t *)p;
     __vec64_i1 r;
     r.v = *ptr;
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index bcc89bf4..8eda5224 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -465,8 +465,8 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
 }
 */
 
-template <int ALIGN> static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p) {
-    uint16_t *ptr = (uint16_t *)p;
+template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
+    const uint16_t *ptr = (const uint16_t *)p;
     __vec16_i1 r;
     r.m = *ptr;
     return r;
@@ -729,14 +729,14 @@ static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __
 }
 */
 
-template <int ALIGN> static FORCEINLINE __vec16_i32 __load(__vec16_i32 *p) {
+template <int ALIGN> static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) {
     __vec16_i32 v;
     v = _mm512_extloadunpackhi_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     return v;
 }
 
-template <> static FORCEINLINE __vec16_i32 __load<64>(__vec16_i32 *p) {
+template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) {
     return _mm512_load_epi32(p);
 }
 
@@ -827,7 +827,7 @@ SHUFFLES(__vec16_i64, i64, int64_t)
 
 LOAD_STORE(__vec16_i64, int64_t)
 
-template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
+template <int ALIGN> static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) {
     __m512i v1;
     __m512i v2;
     v2 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
@@ -851,7 +851,7 @@ template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
     return ret;
 }
 
-template <> static FORCEINLINE __vec16_i64 __load<64>(__vec16_i64 *p) {
+template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) {
     __m512i v2 = _mm512_load_epi32(p);
     __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
     __vec16_i64 ret;
@@ -1015,14 +1015,14 @@ static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f v0, __vec16_f v1, __vec1
 }
 */
 
-template <int ALIGN> static FORCEINLINE __vec16_f __load(__vec16_f *p) {
+template <int ALIGN> static FORCEINLINE __vec16_f __load(const __vec16_f *p) {
     __vec16_f v;
     v = _mm512_extloadunpackhi_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
     v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
     return v;
 }
 
-template <> static FORCEINLINE __vec16_f __load<64>(__vec16_f *p) {
+template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) {
     return _mm512_load_ps(p);
 }
 
@@ -1184,7 +1184,7 @@ static FORCEINLINE __vec16_f __shuffle2_float(__vec16_d v0, __vec16_d v1, __vec1
 }
 */
 
-template <int ALIGN> static FORCEINLINE __vec16_d __load(__vec16_d *p) {
+template <int ALIGN> static FORCEINLINE __vec16_d __load(const __vec16_d *p) {
     __vec16_d ret;
     ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
     ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
@@ -1193,7 +1193,7 @@ template <int ALIGN> static FORCEINLINE __vec16_d __load(__vec16_d *p) {
     return ret;
 }
 
-template <> static FORCEINLINE __vec16_d __load<64>(__vec16_d *p) {
+template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) {
     __vec16_d ret;
     ret.v1 = _mm512_load_pd(p);
     ret.v2 = _mm512_load_pd(((uint8_t*)p)+64);
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 57a483c6..54eb8bc6 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -287,7 +287,7 @@ static FORCEINLINE void __insert_element(__vec4_i1 *v, int index, bool val) {
     ((int32_t *)v)[index] = val ? -1 : 0;
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_i1 __load(__vec4_i1 *v) {
+template <int ALIGN> static FORCEINLINE __vec4_i1 __load(const __vec4_i1 *v) {
     // FIXME: handle align of 16...
     return _mm_loadu_ps((float *)(&v->v));
 }
 
@@ -572,7 +572,7 @@ static FORCEINLINE __vec4_i8 __shuffle2_i8(__vec4_i8 v0, __vec4_i8 v1,
     return __vec4_i8(r[0], r[1], r[2], r[3]);
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_i8 __load(__vec4_i8 *v) {
+template <int ALIGN> static FORCEINLINE __vec4_i8 __load(const __vec4_i8 *v) {
     uint8_t *ptr = (uint8_t *)(&v->v);
     return __vec4_i8(ptr[0], ptr[1], ptr[2], ptr[3]);
 }
@@ -839,7 +839,7 @@ static FORCEINLINE __vec4_i16 __shuffle2_i16(__vec4_i16 v0, __vec4_i16 v1,
     return __vec4_i16(r[0], r[1], r[2], r[3]);
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_i16 __load(__vec4_i16 *v) {
+template <int ALIGN> static FORCEINLINE __vec4_i16 __load(const __vec4_i16 *v) {
     uint16_t *ptr = (uint16_t *)(&v->v);
     return __vec4_i16(ptr[0], ptr[1], ptr[2], ptr[3]);
 }
@@ -1092,7 +1092,7 @@ static FORCEINLINE __vec4_i32 __shuffle2_i32(__vec4_i32 v0, __vec4_i32 v1,
     return __vec4_i32(r[0], r[1], r[2], r[3]);
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_i32 __load(__vec4_i32 *v) {
+template <int ALIGN> static FORCEINLINE __vec4_i32 __load(const __vec4_i32 *v) {
     // FIXME: handle align of 16...
     return _mm_loadu_si128((__m128i *)(&v->v));
 }
@@ -1362,7 +1362,7 @@ static FORCEINLINE __vec4_i64 __shuffle2_i64(__vec4_i64 v0, __vec4_i64 v1,
     return __vec4_i64(r[0], r[1], r[2], r[3]);
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_i64 __load(__vec4_i64 *v) {
+template <int ALIGN> static FORCEINLINE __vec4_i64 __load(const __vec4_i64 *v) {
     // FIXME: handle align of 16...
     return __vec4_i64(_mm_loadu_si128((__m128i *)(&v->v[0])),
                       _mm_loadu_si128((__m128i *)(&v->v[1])));
@@ -1473,7 +1473,7 @@ static FORCEINLINE __vec4_f __shuffle2_float(__vec4_f v0, __vec4_f v1,
     return __vec4_f(r[0], r[1], r[2], r[3]);
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_f __load(__vec4_f *v) {
+template <int ALIGN> static FORCEINLINE __vec4_f __load(const __vec4_f *v) {
     // FIXME: handle align of 16...
     return _mm_loadu_ps((float *)(&v->v));
 }
@@ -1614,7 +1614,7 @@ static FORCEINLINE __vec4_d __shuffle2_double(__vec4_d v0, __vec4_d v1,
     return __vec4_d(r[0], r[1], r[2], r[3]);
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_d __load(__vec4_d *v) {
+template <int ALIGN> static FORCEINLINE __vec4_d __load(const __vec4_d *v) {
     // FIXME: handle align of 16...
     return __vec4_d(_mm_loadu_pd((double *)(&v->v[0])),
                     _mm_loadu_pd((double *)(&v->v[1])));

From 8aa139b6beb1bccb54ec966cd28c01c6f549a127 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Fri, 6 Jul 2012 08:57:09 -0700
Subject: [PATCH 2/2] For C++ output, store constant vector values in local
 arrays.

When we have a constant vector of primitive types, we now generate a
definition of a static const array of the individual values.  This in
turn allows us to emit a simple aligned vector load to get the constant
vector value, rather than inefficiently inserting the values into a
vector.

Issue #318.
---
 cbackend.cpp | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/cbackend.cpp b/cbackend.cpp
index 20a6d210..89287faf 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -227,6 +227,8 @@ namespace {
     const llvm::TargetData* TD;
 
     std::map<const llvm::ConstantFP *, unsigned> FPConstantMap;
+    std::map<const llvm::ConstantDataVector *, unsigned> VectorConstantMap;
+    unsigned VectorConstantIndex;
     std::set<llvm::Function*> intrinsicPrototypesAlreadyGenerated;
     std::set<const llvm::Argument*> ByValParams;
     unsigned FPCounter;
@@ -253,6 +255,7 @@
       vectorWidth(vecwidth) {
       initializeLoopInfoPass(*llvm::PassRegistry::getPassRegistry());
       FPCounter = 0;
+      VectorConstantIndex = 0;
     }
 
     virtual const char *getPassName() const { return "C backend"; }
@@ -278,6 +281,10 @@
       // Output all floating point constants that cannot be printed accurately.
       printFloatingPointConstants(F);
 
+      // Output all vector constants so they can be accessed with single
+      // vector loads
+      printVectorConstants(F);
+
       printFunction(F);
       return false;
     }
@@ -292,6 +299,7 @@
       delete MRI;
       delete MOFI;
       FPConstantMap.clear();
+      VectorConstantMap.clear();
       ByValParams.clear();
       intrinsicPrototypesAlreadyGenerated.clear();
       UnnamedStructIDs.clear();
@@ -350,6 +358,7 @@
     void printContainedArrays(llvm::ArrayType *ATy,
                               llvm::SmallPtrSet<llvm::Type *, 16> &);
     void printFloatingPointConstants(llvm::Function &F);
     void printFloatingPointConstants(const llvm::Constant *C);
+    void printVectorConstants(llvm::Function &F);
     void printFunctionSignature(const llvm::Function *F, bool Prototype);
     void printFunction(llvm::Function &);
@@ -1511,6 +1520,22 @@ void CWriter::printConstant(llvm::Constant *CPV, bool Static) {
       printConstant(splatValue, Static);
       Out << ")";
   }
+  else if (VectorConstantMap.find(CDV) != VectorConstantMap.end()) {
+      // If we have emitted a static const array with the
+      // vector's values, just load from it.
+      unsigned index = VectorConstantMap[CDV];
+      int alignment = 4 * std::min(vectorWidth, 16);
+
+      Out << "__load<" << alignment << ">(";
+
+      // Cast the pointer to the array of element values to a
+      // pointer to the vector type.
+      Out << "(const ";
+      printSimpleType(Out, CDV->getType(), true, "");
+      Out << " *)";
+
+      Out << "(VectorConstant" << index << "))";
+  }
   else {
       printType(Out, CPV->getType());
       Out << "(";
@@ -2515,6 +2540,47 @@ void CWriter::printFloatingPointConstants(const llvm::Constant *C) {
 }
 
 
+// For any vector constants, generate code to declare static const arrays
+// with their element values.  Doing so allows us to emit aligned vector
+// loads to get their values, rather than tediously inserting the
+// individual values into the vector.
+void CWriter::printVectorConstants(llvm::Function &F) {
+    // LLVM 3.1 and beyond have a different representation of constant
+    // vectors than before--here we will only do this for 3.1 and later, as
+    // the separate code path isn't worth the trouble.  This will hurt
+    // performance with 3.0 builds, though they should still generate
+    // correct code.
+#ifndef LLVM_3_0
+    for (llvm::constant_iterator I = constant_begin(&F), E = constant_end(&F);
+         I != E; ++I) {
+        const llvm::ConstantDataVector *CDV = llvm::dyn_cast<llvm::ConstantDataVector>(*I);
+        if (CDV == NULL)
+            continue;
+
+        // Don't bother if this is a splat of the same value; a (more
+        // efficient?) __splat_* call will be generated for these.
+        if (CDV->getSplatValue() != NULL)
+            continue;
+
+        // Don't align to anything more than 64 bytes
+        int alignment = 4 * std::min(vectorWidth, 16);
+
+        Out << "static const ";
+        printSimpleType(Out, CDV->getElementType(), true, "");
+        Out << "__attribute__ ((aligned(" << alignment << "))) ";
+        Out << "VectorConstant" << VectorConstantIndex << "[] = { ";
+        for (int i = 0; i < (int)CDV->getNumElements(); ++i) {
+            printConstant(CDV->getElementAsConstant(i), false);
+            Out << ", ";
+        }
+        Out << " };\n";
+
+        VectorConstantMap[CDV] = VectorConstantIndex++;
+    }
+    Out << "\n";
+#endif // !LLVM_3_0
+}
+
 /// printSymbolTable - Run through symbol table looking for type names.  If a
 /// type name is found, emit its declaration...
 ///
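
Note on the first patch: the only signature change is the const qualifier on __load()'s
pointer parameter, which is what later lets the second patch pass pointers to the emitted
static const arrays straight into __load() without casting away const. The following is a
minimal, hand-written sketch of the 16-wide generic form after the change, not the actual
generic-16.h code: FORCEINLINE is replaced by plain inline, __vec16_i32 is a simplified
stand-in struct, and ALIGN is the alignment-in-bytes template parameter (unused in this
generic path, but used to select specializations such as __load<64>()).

    #include <stdint.h>

    // Simplified stand-in for the real vector type from the intrinsics headers.
    struct __vec16_i32 { int32_t v[16]; };

    // After patch 1: __load() reads through a const pointer, so constant data
    // can be loaded without a const_cast at the call site.
    template <int ALIGN>
    static inline __vec16_i32 __load(const __vec16_i32 *p) {
        const int32_t *ptr = (const int32_t *)p;
        __vec16_i32 ret;
        for (int i = 0; i < 16; ++i)
            ret.v[i] = ptr[i];
        return ret;
    }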
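
And a sketch of what the second patch changes in the generated C++: for a non-splat constant
vector, printVectorConstants() emits a static const array of the element values once per
function, and printConstant() then references it with a single aligned __load<>() call. The
snippet below is illustrative output written by hand, not captured from ispc; it assumes a
16-wide int32 constant on a target where vectorWidth == 16, so alignment = 4 * min(16, 16) = 64,
and the VectorConstant0 name follows the VectorConstant<index> scheme used above.

    // Emitted by printVectorConstants(): element values in a 64-byte-aligned array.
    static const int32_t __attribute__ ((aligned(64))) VectorConstant0[] = {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  };

    // Emitted by printConstant() at each use of the constant: one aligned vector
    // load through a const pointer, instead of sixteen per-lane inserts.
    __vec16_i32 c = __load<64>((const __vec16_i32 *)(VectorConstant0));

Before the patch, printConstant() fell through to the constructor path (the else branch in the
diff above) and spelled out all sixteen element values at every use of the constant.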