From e5fe0eabdc082dc004337b9d194ef6f9224ddcd7 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Fri, 6 Jul 2012 08:47:47 -0700
Subject: [PATCH 1/2] Update __load() builtins to take const pointers.

---
 examples/intrinsics/generic-16.h |  4 ++--
 examples/intrinsics/generic-32.h |  4 ++--
 examples/intrinsics/generic-64.h |  4 ++--
 examples/intrinsics/knc.h        | 20 ++++++++++----------
 examples/intrinsics/sse4.h       | 14 +++++++-------
 5 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index 438b4d5f..e4123cc5 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -212,7 +212,7 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
 
 #define LOAD_STORE(VTYPE, STYPE)                  \
 template <int ALIGN>                              \
-static FORCEINLINE VTYPE __load(VTYPE *p) {       \
+static FORCEINLINE VTYPE __load(const VTYPE *p) { \
     STYPE *ptr = (STYPE *)p;                      \
     VTYPE ret;                                    \
     for (int i = 0; i < 16; ++i)                  \
@@ -395,7 +395,7 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
         vec->v |= (1 << index);
 }
 
-template <int ALIGN> static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p) {
+template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
     uint16_t *ptr = (uint16_t *)p;
     __vec16_i1 r;
     r.v = *ptr;
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index dc55fb00..271f4f52 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -277,7 +277,7 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
 
 #define LOAD_STORE(VTYPE, STYPE)                  \
 template <int ALIGN>                              \
-static FORCEINLINE VTYPE __load(VTYPE *p) {       \
+static FORCEINLINE VTYPE __load(const VTYPE *p) { \
     STYPE *ptr = (STYPE *)p;                      \
     VTYPE ret;                                    \
     for (int i = 0; i < 32; ++i)                  \
@@ -460,7 +460,7 @@ static FORCEINLINE void __insert_element(__vec32_i1 *vec, int index,
         vec->v |= (1 << index);
 }
 
-template <int ALIGN> static FORCEINLINE __vec32_i1 __load(__vec32_i1 *p) {
+template <int ALIGN> static FORCEINLINE __vec32_i1 __load(const __vec32_i1 *p) {
     uint16_t *ptr = (uint16_t *)p;
     __vec32_i1 r;
     r.v = *ptr;
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index f1eb22ae..d1703e38 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -402,7 +402,7 @@ static FORCEINLINE void __insert_element(VTYPE *v, int index, STYPE val) { \
 
 #define LOAD_STORE(VTYPE, STYPE)                  \
 template <int ALIGN>                              \
-static FORCEINLINE VTYPE __load(VTYPE *p) {       \
+static FORCEINLINE VTYPE __load(const VTYPE *p) { \
     STYPE *ptr = (STYPE *)p;                      \
     VTYPE ret;                                    \
     for (int i = 0; i < 64; ++i)                  \
@@ -585,7 +585,7 @@ static FORCEINLINE void __insert_element(__vec64_i1 *vec, int index,
         vec->v |= (1ull << index);
 }
 
-template <int ALIGN> static FORCEINLINE __vec64_i1 __load(__vec64_i1 *p) {
+template <int ALIGN> static FORCEINLINE __vec64_i1 __load(const __vec64_i1 *p) {
     uint16_t *ptr = (uint16_t *)p;
     __vec64_i1 r;
     r.v = *ptr;
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index bcc89bf4..8eda5224 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -465,8 +465,8 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
 }
 */
 
-template <int ALIGN> static FORCEINLINE __vec16_i1 __load(__vec16_i1 *p) {
-    uint16_t *ptr = (uint16_t *)p;
+template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
+    const uint16_t *ptr = (const uint16_t *)p;
     __vec16_i1 r;
     r.m = *ptr;
     return r;
@@ -729,14 +729,14 @@ static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __
 }
 */
 
-template <int ALIGN> static FORCEINLINE __vec16_i32 __load(__vec16_i32 *p) {
+template <int ALIGN> static FORCEINLINE __vec16_i32 __load(const __vec16_i32 *p) {
     __vec16_i32 v;
     v = _mm512_extloadunpackhi_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     v = _mm512_extloadunpacklo_epi32(v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     return v;
 }
 
-template <> static FORCEINLINE __vec16_i32 __load<64>(__vec16_i32 *p) {
+template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) {
     return _mm512_load_epi32(p);
 }
 
@@ -827,7 +827,7 @@ SHUFFLES(__vec16_i64, i64, int64_t)
 
 LOAD_STORE(__vec16_i64, int64_t)
 
-template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
+template <int ALIGN> static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) {
     __m512i v1;
     __m512i v2;
     v2 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
@@ -851,7 +851,7 @@ template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
     return ret;
 }
 
-template <> static FORCEINLINE __vec16_i64 __load<64>(__vec16_i64 *p) {
+template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) {
     __m512i v2 = _mm512_load_epi32(p);
     __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
     __vec16_i64 ret;
@@ -1015,14 +1015,14 @@ static FORCEINLINE __vec16_f __shuffle2_float(__vec16_f v0, __vec16_f v1, __vec1
 }
 */
 
-template <int ALIGN> static FORCEINLINE __vec16_f __load(__vec16_f *p) {
+template <int ALIGN> static FORCEINLINE __vec16_f __load(const __vec16_f *p) {
     __vec16_f v;
     v = _mm512_extloadunpackhi_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
     v = _mm512_extloadunpacklo_ps(v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
     return v;
 }
 
-template <> static FORCEINLINE __vec16_f __load<64>(__vec16_f *p) {
+template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) {
     return _mm512_load_ps(p);
 }
 
@@ -1184,7 +1184,7 @@ static FORCEINLINE __vec16_f __shuffle2_float(__vec16_d v0, __vec16_d v1, __vec1
 }
 */
 
-template <int ALIGN> static FORCEINLINE __vec16_d __load(__vec16_d *p) {
+template <int ALIGN> static FORCEINLINE __vec16_d __load(const __vec16_d *p) {
     __vec16_d ret;
     ret.v1 = _mm512_extloadunpackhi_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
     ret.v1 = _mm512_extloadunpacklo_pd(ret.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
@@ -1193,7 +1193,7 @@ template <int ALIGN> static FORCEINLINE __vec16_d __load(__vec16_d *p) {
     return ret;
 }
 
-template <> static FORCEINLINE __vec16_d __load<64>(__vec16_d *p) {
+template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) {
     __vec16_d ret;
     ret.v1 = _mm512_load_pd(p);
     ret.v2 = _mm512_load_pd(((uint8_t*)p)+64);
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 57a483c6..54eb8bc6 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -287,7 +287,7 @@ static FORCEINLINE void __insert_element(__vec4_i1 *v, int index, bool val) {
     ((int32_t *)v)[index] = val ? -1 : 0;
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_i1 __load(__vec4_i1 *v) {
+template <int ALIGN> static FORCEINLINE __vec4_i1 __load(const __vec4_i1 *v) {
     // FIXME: handle align of 16...
     return _mm_loadu_ps((float *)(&v->v));
 }
 
@@ -572,7 +572,7 @@ static FORCEINLINE __vec4_i8 __shuffle2_i8(__vec4_i8 v0, __vec4_i8 v1,
     return __vec4_i8(r[0], r[1], r[2], r[3]);
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_i8 __load(__vec4_i8 *v) {
+template <int ALIGN> static FORCEINLINE __vec4_i8 __load(const __vec4_i8 *v) {
     uint8_t *ptr = (uint8_t *)(&v->v);
     return __vec4_i8(ptr[0], ptr[1], ptr[2], ptr[3]);
 }
@@ -839,7 +839,7 @@ static FORCEINLINE __vec4_i16 __shuffle2_i16(__vec4_i16 v0, __vec4_i16 v1,
     return __vec4_i16(r[0], r[1], r[2], r[3]);
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_i16 __load(__vec4_i16 *v) {
+template <int ALIGN> static FORCEINLINE __vec4_i16 __load(const __vec4_i16 *v) {
     uint16_t *ptr = (uint16_t *)(&v->v);
     return __vec4_i16(ptr[0], ptr[1], ptr[2], ptr[3]);
 }
@@ -1092,7 +1092,7 @@ static FORCEINLINE __vec4_i32 __shuffle2_i32(__vec4_i32 v0, __vec4_i32 v1,
     return __vec4_i32(r[0], r[1], r[2], r[3]);
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_i32 __load(__vec4_i32 *v) {
+template <int ALIGN> static FORCEINLINE __vec4_i32 __load(const __vec4_i32 *v) {
     // FIXME: handle align of 16...
     return _mm_loadu_si128((__m128i *)(&v->v));
 }
@@ -1362,7 +1362,7 @@ static FORCEINLINE __vec4_i64 __shuffle2_i64(__vec4_i64 v0, __vec4_i64 v1,
     return __vec4_i64(r[0], r[1], r[2], r[3]);
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_i64 __load(__vec4_i64 *v) {
+template <int ALIGN> static FORCEINLINE __vec4_i64 __load(const __vec4_i64 *v) {
     // FIXME: handle align of 16...
     return __vec4_i64(_mm_loadu_si128((__m128i *)(&v->v[0])),
                       _mm_loadu_si128((__m128i *)(&v->v[1])));
@@ -1473,7 +1473,7 @@ static FORCEINLINE __vec4_f __shuffle2_float(__vec4_f v0, __vec4_f v1,
     return __vec4_f(r[0], r[1], r[2], r[3]);
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_f __load(__vec4_f *v) {
+template <int ALIGN> static FORCEINLINE __vec4_f __load(const __vec4_f *v) {
     // FIXME: handle align of 16...
     return _mm_loadu_ps((float *)(&v->v));
 }
@@ -1614,7 +1614,7 @@ static FORCEINLINE __vec4_d __shuffle2_double(__vec4_d v0, __vec4_d v1,
     return __vec4_d(r[0], r[1], r[2], r[3]);
 }
 
-template <int ALIGN> static FORCEINLINE __vec4_d __load(__vec4_d *v) {
+template <int ALIGN> static FORCEINLINE __vec4_d __load(const __vec4_d *v) {
     // FIXME: handle align of 16...
     return __vec4_d(_mm_loadu_pd((double *)(&v->v[0])),
                     _mm_loadu_pd((double *)(&v->v[1])));

From 8aa139b6beb1bccb54ec966cd28c01c6f549a127 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Fri, 6 Jul 2012 08:57:09 -0700
Subject: [PATCH 2/2] For C++ output, store constant vector values in local
 arrays.

When we have a constant vector of primitive types, we now generate a
definition of a static const array of the individual values.  This in
turn allows us to emit a simple aligned vector load to get the constant
vector value, rather than inefficiently inserting the values into a
vector.

Issue #318.
---
 cbackend.cpp | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/cbackend.cpp b/cbackend.cpp
index 20a6d210..89287faf 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -227,6 +227,8 @@ namespace {
     const llvm::TargetData* TD;
 
     std::map<const llvm::ConstantFP *, unsigned> FPConstantMap;
+    std::map<const llvm::ConstantDataVector *, unsigned> VectorConstantMap;
+    unsigned VectorConstantIndex;
     std::set<llvm::Function*> intrinsicPrototypesAlreadyGenerated;
     std::set<const llvm::Argument*> ByValParams;
     unsigned FPCounter;
@@ -253,6 +255,7 @@
       vectorWidth(vecwidth) {
       initializeLoopInfoPass(*llvm::PassRegistry::getPassRegistry());
       FPCounter = 0;
+      VectorConstantIndex = 0;
     }
 
     virtual const char *getPassName() const { return "C backend"; }
@@ -278,6 +281,10 @@
       // Output all floating point constants that cannot be printed accurately.
       printFloatingPointConstants(F);
 
+      // Output all vector constants so they can be accessed with single
+      // vector loads
+      printVectorConstants(F);
+
       printFunction(F);
       return false;
     }
@@ -292,6 +299,7 @@
       delete MRI;
       delete MOFI;
       FPConstantMap.clear();
+      VectorConstantMap.clear();
       ByValParams.clear();
       intrinsicPrototypesAlreadyGenerated.clear();
       UnnamedStructIDs.clear();
@@ -350,6 +358,7 @@
     void printContainedArrays(llvm::ArrayType *ATy,
                               llvm::SmallPtrSet<llvm::Type *, 16> &);
     void printFloatingPointConstants(llvm::Function &F);
     void printFloatingPointConstants(const llvm::Constant *C);
+    void printVectorConstants(llvm::Function &F);
     void printFunctionSignature(const llvm::Function *F, bool Prototype);
     void printFunction(llvm::Function &);
@@ -1511,6 +1520,22 @@ void CWriter::printConstant(llvm::Constant *CPV, bool Static) {
       printConstant(splatValue, Static);
       Out << ")";
   }
+  else if (VectorConstantMap.find(CDV) != VectorConstantMap.end()) {
+      // If we have emitted a static const array with the
+      // vector's values, just load from it.
+      unsigned index = VectorConstantMap[CDV];
+      int alignment = 4 * std::min(vectorWidth, 16);
+
+      Out << "__load<" << alignment << ">(";
+
+      // Cast the pointer to the array of element values to a
+      // pointer to the vector type.
+      Out << "(const ";
+      printSimpleType(Out, CDV->getType(), true, "");
+      Out << " *)";
+
+      Out << "(VectorConstant" << index << "))";
+  }
   else {
       printType(Out, CPV->getType());
       Out << "(";
@@ -2515,6 +2540,47 @@ void CWriter::printFloatingPointConstants(const llvm::Constant *C) {
 }
 
 
+// For any vector constants, generate code to declare static const arrays
+// with their element values.  Doing so allows us to emit aligned vector
+// loads to get their values, rather than tediously inserting the
+// individual values into the vector.
+void CWriter::printVectorConstants(llvm::Function &F) {
+    // LLVM 3.1 and beyond have a different representation of constant
+    // vectors than before--here we will only do this for 3.1 and later, as
+    // the separate code path isn't worth the trouble.  This will hurt
+    // performance with 3.0 builds, though they should still generate
+    // correct code.
+#ifndef LLVM_3_0
+    for (llvm::constant_iterator I = constant_begin(&F), E = constant_end(&F);
+         I != E; ++I) {
+        const llvm::ConstantDataVector *CDV = llvm::dyn_cast<llvm::ConstantDataVector>(*I);
+        if (CDV == NULL)
+            continue;
+
+        // Don't bother if this is a splat of the same value; a (more
+        // efficient?) __splat_* call will be generated for these.
+        if (CDV->getSplatValue() != NULL)
+            continue;
+
+        // Don't align to anything more than 64 bytes
+        int alignment = 4 * std::min(vectorWidth, 16);
+
+        Out << "static const ";
+        printSimpleType(Out, CDV->getElementType(), true, "");
+        Out << "__attribute__ ((aligned(" << alignment << "))) ";
+        Out << "VectorConstant" << VectorConstantIndex << "[] = { ";
+        for (int i = 0; i < (int)CDV->getNumElements(); ++i) {
+            printConstant(CDV->getElementAsConstant(i), false);
+            Out << ", ";
+        }
+        Out << " };\n";
+
+        VectorConstantMap[CDV] = VectorConstantIndex++;
+    }
+    Out << "\n";
+#endif // !LLVM_3_0
+}
+
 /// printSymbolTable - Run through symbol table looking for type names.  If a
 /// type name is found, emit its declaration...
 ///
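
Note on the first patch: the only signature change is the const qualifier on __load()'s
pointer parameter, which is what later lets the second patch pass pointers to the emitted
static const arrays straight into __load() without casting away const. The following is a
minimal, hand-written sketch of the 16-wide generic form after the change, not the actual
generic-16.h code: FORCEINLINE is replaced by plain inline, __vec16_i32 is a simplified
stand-in struct, and ALIGN is the alignment-in-bytes template parameter (unused in this
generic path, but used to select specializations such as __load<64>()).

    #include <stdint.h>

    // Simplified stand-in for the real vector type from the intrinsics headers.
    struct __vec16_i32 { int32_t v[16]; };

    // After patch 1: __load() reads through a const pointer, so constant data
    // can be loaded without a const_cast at the call site.
    template <int ALIGN>
    static inline __vec16_i32 __load(const __vec16_i32 *p) {
        const int32_t *ptr = (const int32_t *)p;
        __vec16_i32 ret;
        for (int i = 0; i < 16; ++i)
            ret.v[i] = ptr[i];
        return ret;
    }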
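
And a sketch of what the second patch changes in the generated C++: for a non-splat constant
vector, printVectorConstants() emits a static const array of the element values once per
function, and printConstant() then references it with a single aligned __load<>() call. The
snippet below is illustrative output written by hand, not captured from ispc; it assumes a
16-wide int32 constant on a target where vectorWidth == 16, so alignment = 4 * min(16, 16) = 64,
and the VectorConstant0 name follows the VectorConstant<index> scheme used above.

    // Emitted by printVectorConstants(): element values in a 64-byte-aligned array.
    static const int32_t __attribute__ ((aligned(64))) VectorConstant0[] = {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  };

    // Emitted by printConstant() at each use of the constant: one aligned vector
    // load through a const pointer, instead of sixteen per-lane inserts.
    __vec16_i32 c = __load<64>((const __vec16_i32 *)(VectorConstant0));

Before the patch, printConstant() fell through to the constructor path (the else branch in the
diff above) and spelled out all sixteen element values at every use of the constant.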