From b9d1f0db18ab54a677b4e207f45d7a4d8c455708 Mon Sep 17 00:00:00 2001
From: Jean-Luc Duprat
Date: Thu, 5 Jul 2012 16:56:13 -0700
Subject: [PATCH 1/2] Ongoing int64 support for KNC: fixes to __load and
 __store. Added __add, __mul, __equal, __not_equal, __extract_element,
 __smear_i64, __cast_sext, __cast_zext, and __scatter_base_offsets32_float.
 __rcp_varying_float now has both a fast-math and a full-precision
 implementation.

---
 examples/intrinsics/knc.h | 140 ++++++++++++++++++++++++++++----------
 1 file changed, 103 insertions(+), 37 deletions(-)

diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 622f10e0..9cc6ef22 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -52,7 +53,13 @@
 #define KNC 1

 extern "C" {
-    int printf(const unsigned char *, ...);
+    int printf(const unsigned char *, ...);
+    int puts(unsigned char *);
+    unsigned int putchar(unsigned int);
+    int fflush(void *);
+    uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t);
+    uint8_t *memset(uint8_t *, uint8_t, uint64_t);
+    void memset_pattern16(void *, const void *, uint64_t);
 }

@@ -130,9 +137,10 @@ typedef struct PRE_ALIGN(64) __vec16_d {

 typedef struct PRE_ALIGN(64) __vec16_i32 {
     operator __m512i() const { return v; }
-    __vec16_i32() { }
-    __vec16_i32(const __m512i& in) { v = in; }
-    __vec16_i32(const __vec16_i32& in) { v = in.v; }
+    __vec16_i32() {}
+    __vec16_i32(const int32_t &in) : v(_mm512_set_1to16_epi32(in)) {};
+    __vec16_i32(const __m512i &in) { v = in; }
+    __vec16_i32(const __vec16_i32 &in) { v = in.v; }
     __vec16_i32(int32_t v00, int32_t v01, int32_t v02, int32_t v03,
                 int32_t v04, int32_t v05, int32_t v06, int32_t v07,
                 int32_t v08, int32_t v09, int32_t v10, int32_t v11,
@@ -142,8 +150,14 @@ typedef struct PRE_ALIGN(64) __vec16_i32 {
     __m512i v;
 } POST_ALIGN(64) __vec16_i32;

+FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32 &in) {
+    m = _mm512_test_epi32_mask(in, in);
+}
+
 typedef struct PRE_ALIGN(64) __vec16_i64 {
-    __vec16_i64() { }
+    __forceinline __vec16_i64();
+    __forceinline __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {};
+    __forceinline __vec16_i64(__m512i l, __m512i h) : v_lo(l), v_hi(h) {};
     __vec16_i64(int64_t v00, int64_t v01, int64_t v02, int64_t v03,
                 int64_t v04, int64_t v05, int64_t v06, int64_t v07,
                 int64_t v08, int64_t v09, int64_t v10, int64_t v11,
@@ -167,9 +181,10 @@ typedef struct PRE_ALIGN(64) __vec16_i64 {
     __m512i v_lo;
 } POST_ALIGN(64) __vec16_i64;

-FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32& in) {
-    m = _mm512_test_epi32_mask(in, in);
-}
+FORCEINLINE __vec16_i64::__vec16_i64()
+    : v_lo(_mm512_undefined_epi32()),
+      v_hi(_mm512_undefined_epi32())
+{}

 template <typename T> struct vec16 {
@@ -619,7 +634,7 @@ static FORCEINLINE __vec16_i32 __ashr(__vec16_i32 a, int32_t n) {
     return _mm512_srai_epi32((__m512i)a, n);
 }

-static FORCEINLINE __vec16_i1 __equal(__vec16_i32 a, __vec16_i32 b) {
+static FORCEINLINE __vec16_i1 __equal(const __vec16_i32 &a, const __vec16_i32 &b) {
     return _mm512_cmpeq_epi32_mask((__m512i)a, (__m512i)b);
 }
@@ -721,10 +736,26 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
 ///////////////////////////////////////////////////////////////////////////
 // int64

-BINARY_OP(__vec16_i64, __add, +)
+
+static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
+{
+    __mmask16 carry = 0;
+    __m512i lo = _mm512_addsetc_epi32(a.v_lo, b.v_lo, &carry);
+    __m512i hi = _mm512_adc_epi32(a.v_hi, carry, b.v_hi, &carry);
+    return __vec16_i64(lo, hi);
+}
+
 BINARY_OP(__vec16_i64, __sub, -)
 BINARY_OP(__vec16_i64, __mul, *)

+/*! 64x32 bit mul -- address computations often use a scale that we
+    know is 32 bits; and 32x64 is faster than 64x64 */
+static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b)
+{
+    return __vec16_i64(_mm512_mullo_epi32(a.v, b.v_lo),
+                       _mm512_add_epi32(b.v_hi, _mm512_mulhi_epi32(a.v, b.v_lo)));
+}
+
 BINARY_OP(__vec16_i64, __or, |)
 BINARY_OP(__vec16_i64, __and, &)
 BINARY_OP(__vec16_i64, __xor, ^)
@@ -742,8 +773,15 @@ SHIFT_UNIFORM(__vec16_i64, uint64_t, __lshr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __ashr, >>)
 SHIFT_UNIFORM(__vec16_i64, int64_t, __shl, <<)

-CMP_OP(__vec16_i64, int64_t, __equal, ==)
-CMP_OP(__vec16_i64, int64_t, __not_equal, !=)
+static FORCEINLINE __vec16_i1 __equal(const __vec16_i64 &a, const __vec16_i64 &b) {
+    const __mmask16 lo_match = _mm512_cmpeq_epi32_mask(a.v_lo, b.v_lo);
+    return _mm512_mask_cmpeq_epi32_mask(lo_match, a.v_hi, b.v_hi);
+}
+
+static FORCEINLINE __vec16_i1 __not_equal(const __vec16_i64 &a, const __vec16_i64 &b) {
+    return __not(__equal(a, b));
+}
+
 CMP_OP(__vec16_i64, uint64_t, __unsigned_less_equal, <=)
 CMP_OP(__vec16_i64, int64_t, __signed_less_equal, <=)
 CMP_OP(__vec16_i64, uint64_t, __unsigned_greater_equal, >=)
@@ -755,7 +793,18 @@ CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >)

 SELECT(__vec16_i64)
 INSERT_EXTRACT(__vec16_i64, int64_t)
-SMEAR(__vec16_i64, i64, int64_t)
+
+static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
+{
+    uint *src = (uint *)&v;
+    return src[index+16] | (int64_t(src[index]) << 32);
+}
+
+static FORCEINLINE __vec16_i64 __smear_i64(__vec16_i64, const int64_t &l) {
+    const int *i = (const int *)&l;
+    return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1]));
+}
+
 BROADCAST(__vec16_i64, i64, int64_t)
 ROTATE(__vec16_i64, i64, int64_t)
 SHUFFLES(__vec16_i64, i64, int64_t)
@@ -765,10 +814,10 @@ LOAD_STORE(__vec16_i64, int64_t)

 template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
     __m512i v1;
     __m512i v2;
-    v1 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v1 = _mm512_extloadunpacklo_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v2 = _mm512_extloadunpackhi_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v2 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v2 = _mm512_extloadunpacklo_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v1 = _mm512_extloadunpackhi_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v1 = _mm512_extloadunpacklo_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);

     __vec16_i64 ret;
     ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
@@ -787,8 +836,8 @@ template <int ALIGN> static FORCEINLINE __vec16_i64 __load(__vec16_i64 *p) {
 }

 template <> static FORCEINLINE __vec16_i64 __load<64>(__vec16_i64 *p) {
-    __m512i v1 = _mm512_load_epi32(p);
-    __m512i v2 = _mm512_load_epi32(((uint8_t*)p)+64);
+    __m512i v2 = _mm512_load_epi32(p);
+    __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64);
     __vec16_i64 ret;
     ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
                                             _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0),
@@ -820,10 +869,10 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
     v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
                                       _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
                                       v.v_lo);
-    _mm512_extpackstorehi_epi32(p, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-    _mm512_extpackstorelo_epi32(p, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-    _mm512_extpackstorehi_epi32(((uint8_t*)p)+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-    _mm512_extpackstorelo_epi32(((uint8_t*)p)+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+    _mm512_extpackstorehi_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+    _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+    _mm512_extpackstorehi_epi32(((uint8_t*)p)+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
+    _mm512_extpackstorelo_epi32(((uint8_t*)p)+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
 }

 template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
@@ -841,8 +890,8 @@ template <> static FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) {
     v2 = _mm512_mask_permutevar_epi32(v2, 0x5555,
                                       _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0),
                                       v.v_lo);
-    _mm512_store_epi64(p, v1);
-    _mm512_store_epi64(((uint8_t*)p)+64, v2);
+    _mm512_store_epi64(p, v2);
+    _mm512_store_epi64(((uint8_t*)p)+64, v1);
 }

@@ -1161,7 +1210,11 @@ static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) {    \
 CAST_SEXT_I1(__vec16_i8)
 CAST_SEXT_I1(__vec16_i16)
 CAST_SEXT_I1(__vec16_i32)
-CAST_SEXT_I1(__vec16_i64)
+
+static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
+{
+    return __vec16_i64(val.v, _mm512_srai_epi32(val.v, 31));
+}

 // zero extension
 CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext)
@@ -1171,6 +1224,11 @@ CAST(__vec16_i32, uint32_t, __vec16_i16, uint16_t, __cast_zext)
 CAST(__vec16_i32, uint32_t, __vec16_i8, uint8_t, __cast_zext)
 CAST(__vec16_i16, uint16_t, __vec16_i8, uint8_t, __cast_zext)

+static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i32 &val)
+{
+    return __vec16_i64(val.v, _mm512_setzero_epi32());
+}
+
 #define CAST_ZEXT_I1(TYPE) /*
 static FORCEINLINE TYPE __cast_zext(TYPE, __vec16_i1 v) {    \
@@ -1459,8 +1517,11 @@ static FORCEINLINE __vec16_f __sqrt_varying_float(__vec16_f v) {
 }

 static FORCEINLINE __vec16_f __rcp_varying_float(__vec16_f v) {
+#ifdef ISPC_FAST_MATH
     return _mm512_recip_ps(v);
-    //return _mm512_rcp23_ps(v); // 23-bits of accuracy
+#else
+    return _mm512_rcp23_ps(v); // 23 bits of accuracy
+#endif
 }

 static FORCEINLINE __vec16_f __rsqrt_varying_float(__vec16_f v) {
@@ -1752,7 +1813,6 @@ GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
 GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
 GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
 GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
-//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __gather_base_offsets32_i32)
 GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
 GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
 GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
@@ -1777,7 +1837,7 @@ GATHER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __gather64_i16)
 GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i32, __gather32_i32)
 GATHER_GENERAL(__vec16_i32, int32_t, __vec16_i64, __gather64_i32)
 GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i32, __gather32_i64)
-GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)
+// GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64)

 /*
 static __forceinline __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask) {
@@ -1820,7 +1880,6 @@ SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i
 SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
 SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
 SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
-//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)
 SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
 SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
 SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
@@ -1828,14 +1887,21 @@ SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64

 static FORCEINLINE void
 __scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
                              uint32_t scale, __vec16_i32 constOffset,
-                             __vec16_i32 val, __vec16_i1 mask) {
-    __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
-    __vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
-
-    // Loop is generated by the intrinsic
+                             __vec16_i32 val, __vec16_i1 mask)
+{
+    __vec16_i32 offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset);
     _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE);
 }

+static FORCEINLINE void
+__scatter_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
+                               uint32_t scale, const __vec16_i32 &constOffset,
+                               const __vec16_f &val, const __vec16_i1 mask)
+{
+    __vec16_i32 offsets = __add(__mul(varyingOffset, __vec16_i32(scale)), constOffset);
+    _mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
+}
+
 #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) /*
 static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) {    \
@@ -1846,8 +1912,7 @@ static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) {    \
             *ptr = val.v[i];    \
         }    \
 }
-*/
-SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
+*/ SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i32, __scatter32_i8)
 SCATTER_GENERAL(__vec16_i8, int8_t, __vec16_i64, __scatter64_i8)
 SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i32, __scatter32_i16)
 SCATTER_GENERAL(__vec16_i16, int16_t, __vec16_i64, __scatter64_i16)
@@ -2158,3 +2223,4 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval,

 #undef PRE_ALIGN
 #undef POST_ALIGN
+

From aecd6e0878d9a014d853c5307b569f75235bff8b Mon Sep 17 00:00:00 2001
From: Jean-Luc Duprat
Date: Tue, 17 Jul 2012 17:06:36 -0700
Subject: [PATCH 2/2] All the smear(), setzero() and undef() APIs are now
 templated on the return type. Modified ISPC's internal name mangling to
 pass these templated names through unchanged, taking care not to
 introduce an ABI change.
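For reference, the new calling convention looks like this. This is a
minimal, self-contained sketch rather than code from the tree: "Vec16"
is a hypothetical stand-in for one of the __vecN_* types. The primary
template is declared but never defined, so only the per-type explicit
specializations can be instantiated, and the caller now names the
return type explicitly:

    #include <cstdint>

    struct Vec16 { int32_t v[16]; };   // hypothetical __vec16_i32-style type

    // Declared, never defined: forces use of an explicit specialization.
    template <class RetVecType> RetVecType __smear_i32(int32_t v);

    template <> Vec16 __smear_i32<Vec16>(int32_t v) {
        Vec16 ret;
        for (int i = 0; i < 16; ++i)
            ret.v[i] = v;              // replicate the scalar into all lanes
        return ret;
    }

    int main() {
        Vec16 x = __smear_i32<Vec16>(7);   // return type picked at the call site
        return x.v[0] == 7 ? 0 : 1;
    }

Because the specializations keep the same bodies and parameter lists as
the old non-template functions, routing calls through them should not
alter the generated code's ABI.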
---
 cbackend.cpp                     | 47 +++++++++---------
 examples/intrinsics/generic-16.h | 19 +++++---
 examples/intrinsics/generic-32.h | 20 +++++---
 examples/intrinsics/generic-64.h | 18 ++++---
 examples/intrinsics/knc.h        | 64 ++++++++++++++---------
 examples/intrinsics/sse4.h       | 81 ++++++++++++++++++++------------
 stmt.cpp                         |  2 +-
 type.cpp                         | 10 ++--
 8 files changed, 163 insertions(+), 98 deletions(-)

diff --git a/cbackend.cpp b/cbackend.cpp
index 1505dae6..9f6eb7eb 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -523,7 +523,7 @@ static std::string CBEMangle(const std::string &S) {
   std::string Result;

   for (unsigned i = 0, e = S.size(); i != e; ++i)
-    if (isalnum(S[i]) || S[i] == '_') {
+    if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') {
       Result += S[i];
     } else {
       Result += '_';
@@ -1115,29 +1115,35 @@ bool CWriter::printCast(unsigned opc, llvm::Type *SrcTy, llvm::Type *DstTy) {

 /** Construct the name of a function with the given base and returning a
-    vector of a given type. For example, if base is "foo" and matchType is
-    i16, this will return the string "__foo_i16".
+    vector of a given type, of the specified width. For example, if base
+    is "foo" and matchType is i32 and width is 16, this will return the
+    string "__foo_i32<__vec16_i32>".
 */
 static const char *
-lGetTypedFunc(const char *base, llvm::Type *matchType) {
-  char buf[64];
-  sprintf(buf, "__%s_", base);
+lGetTypedFunc(const char *base, llvm::Type *matchType, int width) {
+  static const char *ty_desc_str[] = {"f", "d", "i1", "i8", "i16", "i32", "i64"};
+  static const char *fn_desc_str[] = {"float", "double", "i1", "i8", "i16", "i32", "i64"};
+  enum {DESC_FLOAT, DESC_DOUBLE, DESC_I1, DESC_I8, DESC_I16, DESC_I32, DESC_I64} desc;
+
   switch (matchType->getTypeID()) {
-  case llvm::Type::FloatTyID:  strcat(buf, "float");  break;
-  case llvm::Type::DoubleTyID: strcat(buf, "double"); break;
+  case llvm::Type::FloatTyID:  desc = DESC_FLOAT;  break;
+  case llvm::Type::DoubleTyID: desc = DESC_DOUBLE; break;
   case llvm::Type::IntegerTyID: {
     switch (llvm::cast<llvm::IntegerType>(matchType)->getBitWidth()) {
-    case 1:  strcat(buf, "i1");  break;
-    case 8:  strcat(buf, "i8");  break;
-    case 16: strcat(buf, "i16"); break;
-    case 32: strcat(buf, "i32"); break;
-    case 64: strcat(buf, "i64"); break;
+    case 1:  desc = DESC_I1;  break;
+    case 8:  desc = DESC_I8;  break;
+    case 16: desc = DESC_I16; break;
+    case 32: desc = DESC_I32; break;
+    case 64: desc = DESC_I64; break;
     default: return NULL;
     }
     break;
   }
   default: return NULL;
   }
+
+  char buf[64];
+  snprintf(buf, 64, "__%s_%s<__vec%d_%s>", base, fn_desc_str[desc], width, ty_desc_str[desc]);
   return strdup(buf);
 }
@@ -1486,19 +1492,19 @@ void CWriter::printConstant(llvm::Constant *CPV, bool Static) {

     if (llvm::isa<llvm::ConstantAggregateZero>(CPV)) {
       // All zeros; call the __setzero_* function.
-      const char *setZeroFunc = lGetTypedFunc("setzero", VT->getElementType());
+      const char *setZeroFunc = lGetTypedFunc("setzero", VT->getElementType(), vectorWidth);
       assert(setZeroFunc != NULL);
       Out << setZeroFunc << "()";
     }
     else if (llvm::isa<llvm::UndefValue>(CPV)) {
       // Undefined value; call __undef_* so that we can potentially pass
       // this information along..
-      const char *undefFunc = lGetTypedFunc("undef", VT->getElementType());
+      const char *undefFunc = lGetTypedFunc("undef", VT->getElementType(), vectorWidth);
       assert(undefFunc != NULL);
       Out << undefFunc << "()";
     }
     else {
-      const char *smearFunc = lGetTypedFunc("smear", VT->getElementType());
+      const char *smearFunc = lGetTypedFunc("smear", VT->getElementType(), vectorWidth);

       if (llvm::ConstantVector *CV = llvm::dyn_cast<llvm::ConstantVector>(CPV)) {
         llvm::Constant *splatValue = CV->getSplatValue();
@@ -1713,8 +1719,8 @@ void CWriter::printConstantWithCast(llvm::Constant* CPV, unsigned Opcode) {

 std::string CWriter::GetValueName(const llvm::Value *Operand) {

   // Resolve potential alias.
-  if (const llvm::GlobalAlias *GA = llvm::dyn_cast<llvm::GlobalAlias>(Operand)) {
-    if (const llvm::Value *V = GA->resolveAliasedGlobal(false))
+  if (const llvm::GlobalAlias *GA = llvm::dyn_cast<llvm::GlobalAlias>(Operand)) {
+    if (const llvm::Value *V = GA->resolveAliasedGlobal(false))
       Operand = V;
   }
@@ -4362,12 +4368,11 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         {
             llvm::Type *matchType = toMatch->getType();
-            const char *smearFuncName = lGetTypedFunc("smear", matchType);
-
+            const char *smearFuncName = lGetTypedFunc("smear", matchType, vectorWidth);
             if (smearFuncName != NULL) {
                 llvm::Function *smearFunc = module->getFunction(smearFuncName);
                 if (smearFunc == NULL) {
-                    // Declare the smar function if needed; it takes a single
+                    // Declare the smear function if needed; it takes a single
                     // scalar parameter and returns a vector of the same
                     // parameter type.
                     llvm::Constant *sf =

diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index 42978701..c4bff793 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -271,7 +271,8 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {    \
 }

 #define SMEAR(VTYPE, NAME, STYPE)    \
-static FORCEINLINE VTYPE __smear_##NAME(STYPE v) {    \
+template <class RetVecType> VTYPE __smear_##NAME(STYPE);    \
+template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) {    \
     VTYPE ret;    \
     for (int i = 0; i < 16; ++i)    \
         ret.v[i] = v;    \
@@ -279,7 +280,8 @@ static FORCEINLINE VTYPE __smear_##NAME(STYPE v) {    \
 }

 #define SETZERO(VTYPE, NAME)    \
-static FORCEINLINE VTYPE __setzero_##NAME() {    \
+template <class RetVecType> VTYPE __setzero_##NAME();    \
+template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() {    \
     VTYPE ret;    \
     for (int i = 0; i < 16; ++i)    \
         ret.v[i] = 0;    \
@@ -287,7 +289,8 @@ static FORCEINLINE VTYPE __setzero_##NAME() {    \
 }

 #define UNDEF(VTYPE, NAME)    \
-static FORCEINLINE VTYPE __undef_##NAME() {    \
+template <class RetVecType> VTYPE __undef_##NAME();    \
+template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() {    \
     return VTYPE();    \
 }
@@ -416,18 +419,20 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
     *ptr = v.v;
 }

-static FORCEINLINE __vec16_i1 __smear_i1(int v) {
+template <class RetVecType> __vec16_i1 __smear_i1(int i);
+template <> FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int v) {
     return __vec16_i1(v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v);
 }

-static FORCEINLINE __vec16_i1 __setzero_i1() {
+template <class RetVecType> __vec16_i1 __setzero_i1();
+template <> FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
     return __vec16_i1(0, 0, 0, 0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0);
 }

-
-static FORCEINLINE __vec16_i1 __undef_i1() {
+template <class RetVecType> __vec16_i1 __undef_i1();
+template <> FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
     return __vec16_i1();
 }

diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index 94946f4a..ffd5b478 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -336,15 +336,17 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {    \
 }

 #define SMEAR(VTYPE, NAME, STYPE)    \
-static FORCEINLINE VTYPE __smear_##NAME(STYPE v) {    \
+template <class RetVecType> VTYPE __smear_##NAME(STYPE);    \
+template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) {    \
     VTYPE ret;    \
     for (int i = 0; i < 32; ++i)    \
         ret.v[i] = v;    \
     return ret;    \
 }

-#define SETZERO(VTYPE, NAME)    \
-static FORCEINLINE VTYPE __setzero_##NAME() {    \
+#define SETZERO(VTYPE, NAME)    \
+template <class RetVecType> VTYPE __setzero_##NAME();    \
+template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() {    \
     VTYPE ret;    \
     for (int i = 0; i < 32; ++i)    \
         ret.v[i] = 0;    \
@@ -352,7 +354,8 @@ static FORCEINLINE VTYPE __setzero_##NAME() {    \
 }

 #define UNDEF(VTYPE, NAME)    \
-static FORCEINLINE VTYPE __undef_##NAME() {    \
+template <class RetVecType> VTYPE __undef_##NAME();    \
+template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() {    \
     return VTYPE();    \
 }
@@ -481,21 +484,24 @@ template <int ALIGN> static FORCEINLINE void __store(__vec32_i1 *p, __vec32_i1 v
     *ptr = v.v;
 }

-static FORCEINLINE __vec32_i1 __smear_i1(int v) {
+template <class RetVecType> __vec32_i1 __smear_i1(int i);
+template <> FORCEINLINE __vec32_i1 __smear_i1<__vec32_i1>(int v) {
     return __vec32_i1(v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v);
 }

-static FORCEINLINE __vec32_i1 __setzero_i1() {
+template <class RetVecType> __vec32_i1 __setzero_i1();
+template <> FORCEINLINE __vec32_i1 __setzero_i1<__vec32_i1>() {
     return __vec32_i1(0, 0, 0, 0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0);
 }

-static FORCEINLINE __vec32_i1 __undef_i1() {
+template <class RetVecType> __vec32_i1 __undef_i1();
+template <> FORCEINLINE __vec32_i1 __undef_i1<__vec32_i1>() {
     return __vec32_i1();
 }

diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index ff84fee3..a33e1d15 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -461,7 +461,8 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {    \
 }

 #define SMEAR(VTYPE, NAME, STYPE)    \
-static FORCEINLINE VTYPE __smear_##NAME(STYPE v) {    \
+template <class RetVecType> VTYPE __smear_##NAME(STYPE);    \
+template <> FORCEINLINE VTYPE __smear_##NAME<VTYPE>(STYPE v) {    \
     VTYPE ret;    \
     for (int i = 0; i < 64; ++i)    \
         ret.v[i] = v;    \
@@ -469,7 +470,8 @@ static FORCEINLINE VTYPE __smear_##NAME(STYPE v) {    \
 }

 #define SETZERO(VTYPE, NAME)    \
-static FORCEINLINE VTYPE __setzero_##NAME() {    \
+template <class RetVecType> VTYPE __setzero_##NAME();    \
+template <> FORCEINLINE VTYPE __setzero_##NAME<VTYPE>() {    \
     VTYPE ret;    \
     for (int i = 0; i < 64; ++i)    \
         ret.v[i] = 0;    \
@@ -477,7 +479,8 @@ static FORCEINLINE VTYPE __setzero_##NAME() {    \
 }

 #define UNDEF(VTYPE, NAME)    \
-static FORCEINLINE VTYPE __undef_##NAME(VTYPE retType) {    \
+template <class RetVecType> VTYPE __undef_##NAME();    \
+template <> FORCEINLINE VTYPE __undef_##NAME<VTYPE>() {    \
     return VTYPE();    \
 }
@@ -606,7 +609,8 @@ template <int ALIGN> static FORCEINLINE void __store(__vec64_i1 *p, __vec64_i1 v
     *ptr = v.v;
 }

-static FORCEINLINE __vec64_i1 __smear_i1(int v) {
+template <class RetVecType> __vec64_i1 __smear_i1(int i);
+template <> FORCEINLINE __vec64_i1 __smear_i1<__vec64_i1>(int v) {
     return __vec64_i1(v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
@@ -617,7 +621,8 @@ static FORCEINLINE __vec64_i1 __smear_i1(int v) {
                       v, v, v, v, v, v, v, v);
 }

-static FORCEINLINE __vec64_i1 __setzero_i1() {
+template <class RetVecType> __vec64_i1 __setzero_i1();
+template <> FORCEINLINE __vec64_i1 __setzero_i1<__vec64_i1>() {
     return __vec64_i1(0, 0, 0, 0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0,
                       0, 0, 0, 0, 0, 0, 0, 0,
@@ -628,7 +633,8 @@ static FORCEINLINE __vec64_i1 __setzero_i1() {
                       0, 0, 0, 0, 0, 0, 0, 0);
 }

-static FORCEINLINE __vec64_i1 __undef_i1() {
+template <class RetVecType> __vec64_i1 __undef_i1();
+template <> FORCEINLINE __vec64_i1 __undef_i1<__vec64_i1>() {
     return __vec64_i1();
 }

diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 404cd24f..6abffe74 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -477,15 +477,18 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
     *ptr = v.m;
 }

-static FORCEINLINE __vec16_i1 __smear_i1(int i) {
+template <class RetVecType> __vec16_i1 __smear_i1(int i);
+template <> static FORCEINLINE __vec16_i1 __smear_i1<__vec16_i1>(int i) {
     return i ? 0xFFFF : 0x0;
 }

-static FORCEINLINE __vec16_i1 __setzero_i1() {
+template <class RetVecType> __vec16_i1 __setzero_i1();
+template <> static FORCEINLINE __vec16_i1 __setzero_i1<__vec16_i1>() {
     return 0;
 }

-static FORCEINLINE __vec16_i1 __undef_i1() {
+template <class RetVecType> __vec16_i1 __undef_i1();
+template <> static FORCEINLINE __vec16_i1 __undef_i1<__vec16_i1>() {
     return __vec16_i1(); // FIXME? __mm512_undef_mask();
 }
@@ -744,15 +747,18 @@ static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b)
 static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { return ((int32_t *)&v)[index]; }
 static FORCEINLINE void __insert_element(__vec16_i32 *v, int index, int32_t val) { ((int32_t *)v)[index] = val; }

-static FORCEINLINE __vec16_i32 __smear_i32(int32_t i) {
+template <class RetVecType> __vec16_i32 __smear_i32(int32_t i);
+template <> static FORCEINLINE __vec16_i32 __smear_i32<__vec16_i32>(int32_t i) {
     return _mm512_set_1to16_epi32(i);
 }

-static FORCEINLINE __vec16_i32 __setzero_i32() {
+template <class RetVecType> __vec16_i32 __setzero_i32();
+template <> static FORCEINLINE __vec16_i32 __setzero_i32<__vec16_i32>() {
     return _mm512_setzero_epi32();
 }

-static FORCEINLINE __vec16_i32 __undef_i32() {
+template <class RetVecType> __vec16_i32 __undef_i32();
+template <> static FORCEINLINE __vec16_i32 __undef_i32<__vec16_i32>() {
     return _mm512_undefined_epi32();
 }
@@ -803,15 +809,24 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {

 // int64

-static FORCEINLINE __vec16_i64 __setzero_i64() {
+template <class RetVecType> __vec16_i64 __setzero_i64();
+template <> static FORCEINLINE __vec16_i64 __setzero_i64<__vec16_i64>() {
     __vec16_i64 ret;
     ret.v_lo = _mm512_setzero_epi32();
     ret.v_hi = _mm512_setzero_epi32();
     return ret;
 }

+template <class RetVecType> __vec16_i64 __undef_i64();
+template <> static FORCEINLINE __vec16_i64 __undef_i64<__vec16_i64>() {
+    __vec16_i64 ret;
+    ret.v_lo = _mm512_undefined_epi32();
+    ret.v_hi = _mm512_undefined_epi32();
+    return ret;
+}
+
 static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
-{ 
+{
     __mmask16 carry = 0;
     __m512i lo = _mm512_addsetc_epi32(a.v_lo, b.v_lo, &carry);
     __m512i hi = _mm512_adc_epi32(a.v_hi, carry, b.v_hi, &carry);
@@ -885,7 +900,8 @@ static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
     return src[index+16] | (int64_t(src[index]) << 32);
 }

-static FORCEINLINE __vec16_i64 __smear_i64(const int64_t &l) {
+template <class RetVecType> __vec16_i64 __smear_i64(const int64_t &l);
+template <> FORCEINLINE __vec16_i64 __smear_i64<__vec16_i64>(const int64_t &l) {
     const int *i = (const int *)&l;
     return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1]));
 }
@@ -897,12 +913,12 @@ LOAD_STORE(__vec16_i64, int64_t)

 template <int ALIGN> static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) {
-    __m512i v1;
-    __m512i v2;
-    v2 = _mm512_extloadunpackhi_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v2 = _mm512_extloadunpacklo_epi32(v1, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v1 = _mm512_extloadunpackhi_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    v1 = _mm512_extloadunpacklo_epi32(v2, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    __vec16_i32 v1 = _mm512_undefined_epi32();
+    __vec16_i32 v2 = _mm512_undefined_epi32();
+    v2 = _mm512_extloadunpackhi_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v2 = _mm512_extloadunpacklo_epi32(v2, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v1 = _mm512_extloadunpackhi_epi32(v1, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
+    v1 = _mm512_extloadunpacklo_epi32(v1, ((uint8_t*)p)+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);

     __vec16_i64 ret;
     ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00,
@@ -1078,15 +1094,18 @@ static FORCEINLINE void __insert_element(__vec16_f *v, int index, float val) {
     ((float *)v)[index] = val;
 }

-static FORCEINLINE __vec16_f __smear_float(float f) {
+template <class RetVecType> __vec16_f __smear_float(float f);
+template <> static FORCEINLINE __vec16_f __smear_float<__vec16_f>(float f) {
    return _mm512_set_1to16_ps(f);
 }

-static FORCEINLINE __vec16_f __setzero_float() {
+template <class RetVecType> __vec16_f __setzero_float();
+template <> static FORCEINLINE __vec16_f __setzero_float<__vec16_f>() {
     return _mm512_setzero_ps();
 }

-static FORCEINLINE __vec16_f __undef_float() {
+template <class RetVecType> __vec16_f __undef_float();
+template <> static FORCEINLINE __vec16_f __undef_float<__vec16_f>() {
     return _mm512_undefined_ps();
 }
@@ -1287,21 +1306,24 @@ static FORCEINLINE void __insert_element(__vec16_d *v, int index, double val) {
     ((double *)v)[index] = val;
 }

-static FORCEINLINE __vec16_d __smear_double(double d) {
+template <class RetVecType> __vec16_d __smear_double(double d);
+template <> static FORCEINLINE __vec16_d __smear_double<__vec16_d>(double d) {
     __vec16_d ret;
     ret.v1 = _mm512_extload_pd(&d, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
     ret.v2 = ret.v1;
     return ret;
 }

-static FORCEINLINE __vec16_d __setzero_double() {
+template <class RetVecType> __vec16_d __setzero_double();
+template <> static FORCEINLINE __vec16_d __setzero_double<__vec16_d>() {
     __vec16_d ret;
     ret.v1 = _mm512_setzero_pd();
     ret.v2 = ret.v1;
     return ret;
 }

-static FORCEINLINE __vec16_d __undef_double() {
+template <class RetVecType> __vec16_d __undef_double();
+template <> static FORCEINLINE __vec16_d __undef_double<__vec16_d>() {
     __vec16_d ret;
     ret.v1 = _mm512_undefined_pd();
     ret.v2 = ret.v1;

diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 17ab8f18..b894cb29 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -322,15 +322,18 @@ template <int ALIGN> static FORCEINLINE void __store(__vec4_i1 *p, __vec4_i1 val
     _mm_storeu_ps((float *)(&p->v), value.v);
 }

-static FORCEINLINE __vec4_i1 __smear_i1(int v) {
+template <class RetVecType> __vec4_i1 __smear_i1(int v);
+template <> FORCEINLINE __vec4_i1 __smear_i1<__vec4_i1>(int v) {
     return __vec4_i1(v, v, v, v);
 }

-static FORCEINLINE __vec4_i1 __setzero_i1() {
+template <class RetVecType> __vec4_i1 __setzero_i1();
+template <> FORCEINLINE __vec4_i1 __setzero_i1<__vec4_i1>() {
     return __vec4_i1(_mm_setzero_ps());
 }

-static FORCEINLINE __vec4_i1 __undef_i1() {
+template <class RetVecType> __vec4_i1 __undef_i1();
+template <> FORCEINLINE __vec4_i1 __undef_i1<__vec4_i1>() {
     return __vec4_i1();
 }
@@ -560,15 +563,18 @@ static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) {
     ((int8_t *)v)[index] = val;
 }

-static FORCEINLINE __vec4_i8 __smear_i8(int8_t v) {
+template <class RetVecType> __vec4_i8 __smear_i8(int8_t v);
+template <> FORCEINLINE __vec4_i8 __smear_i8<__vec4_i8>(int8_t v) {
     return _mm_set1_epi8(v);
 }

-static FORCEINLINE __vec4_i8 __setzero_i8() {
+template <class RetVecType> __vec4_i8 __setzero_i8();
+template <> FORCEINLINE __vec4_i8 __setzero_i8<__vec4_i8>() {
     return _mm_set1_epi8(0);
 }

-static FORCEINLINE __vec4_i8 __undef_i8() {
+template <class RetVecType> __vec4_i8 __undef_i8();
+template <> FORCEINLINE __vec4_i8 __undef_i8<__vec4_i8>() {
     return __vec4_i8();
 }
@@ -829,15 +835,18 @@ static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val)
     ((int16_t *)v)[index] = val;
 }

-static FORCEINLINE __vec4_i16 __smear_i16(int16_t v) {
+template <class RetVecType> __vec4_i16 __smear_i16(int16_t v);
+template <> FORCEINLINE __vec4_i16 __smear_i16<__vec4_i16>(int16_t v) {
     return _mm_set1_epi16(v);
 }

-static FORCEINLINE __vec4_i16 __setzero_i16() {
+template <class RetVecType> __vec4_i16 __setzero_i16();
+template <> FORCEINLINE __vec4_i16 __setzero_i16<__vec4_i16>() {
     return _mm_set1_epi16(0);
 }

-static FORCEINLINE __vec4_i16 __undef_i16() {
+template <class RetVecType> __vec4_i16 __undef_i16();
+template <> FORCEINLINE __vec4_i16 __undef_i16<__vec4_i16>() {
     return __vec4_i16();
 }
@@ -1076,15 +1085,18 @@ static FORCEINLINE __vec4_i32 __select(__vec4_i1 mask, __vec4_i32 a, __vec4_i32
                                        _mm_castsi128_ps(a.v), mask.v));
 }

-static FORCEINLINE __vec4_i32 __smear_i32(int32_t v) {
+template <class RetVecType> __vec4_i32 __smear_i32(int32_t v);
+template <> FORCEINLINE __vec4_i32 __smear_i32<__vec4_i32>(int32_t v) {
     return _mm_set1_epi32(v);
 }

-static FORCEINLINE __vec4_i32 __setzero_i32() {
+template <class RetVecType> __vec4_i32 __setzero_i32();
+template <> FORCEINLINE __vec4_i32 __setzero_i32<__vec4_i32>() {
     return _mm_castps_si128(_mm_setzero_ps());
 }

-static FORCEINLINE __vec4_i32 __undef_i32() {
+template <class RetVecType> __vec4_i32 __undef_i32();
+template <> FORCEINLINE __vec4_i32 __undef_i32<__vec4_i32>() {
     return __vec4_i32();
 }
@@ -1347,15 +1359,18 @@ static FORCEINLINE __vec4_i64 __select(__vec4_i1 mask, __vec4_i64 a, __vec4_i64
     return __vec4_i64(_mm_castpd_si128(r0), _mm_castpd_si128(r1));
 }

-static FORCEINLINE __vec4_i64 __smear_i64(int64_t v) {
+template <class RetVecType> __vec4_i64 __smear_i64(int64_t v);
+template <> FORCEINLINE __vec4_i64 __smear_i64<__vec4_i64>(int64_t v) {
     return __vec4_i64(v, v, v, v);
 }

-static FORCEINLINE __vec4_i64 __setzero_i64() {
+template <class RetVecType> __vec4_i64 __setzero_i64();
+template <> FORCEINLINE __vec4_i64 __setzero_i64<__vec4_i64>() {
     return __vec4_i64(0, 0, 0, 0);
 }

-static FORCEINLINE __vec4_i64 __undef_i64() {
+template <class RetVecType> __vec4_i64 __undef_i64();
+template <> FORCEINLINE __vec4_i64 __undef_i64<__vec4_i64>() {
     return __vec4_i64();
 }
@@ -1465,15 +1480,18 @@ static FORCEINLINE __vec4_f __select(__vec4_i1 mask, __vec4_f a, __vec4_f b) {
     return _mm_blendv_ps(b.v, a.v, mask.v);
 }

-static FORCEINLINE __vec4_f __smear_float(float v) {
+template <class RetVecType> __vec4_f __smear_float(float v);
+template <> FORCEINLINE __vec4_f __smear_float<__vec4_f>(float v) {
     return _mm_set1_ps(v);
 }

-static FORCEINLINE __vec4_f __setzero_float() {
+template <class RetVecType> __vec4_f __setzero_float();
+template <> FORCEINLINE __vec4_f __setzero_float<__vec4_f>() {
     return _mm_setzero_ps();
 }

-static FORCEINLINE __vec4_f __undef_float() {
+template <class RetVecType> __vec4_f __undef_float();
+template <> FORCEINLINE __vec4_f __undef_float<__vec4_f>() {
     return __vec4_f();
 }
@@ -1614,15 +1632,18 @@ static FORCEINLINE __vec4_d __select(__vec4_i1 mask, __vec4_d a, __vec4_d b) {
     return __vec4_d(r0, r1);
 }

-static FORCEINLINE __vec4_d __smear_double(double v) {
+template <class RetVecType> __vec4_d __smear_double(double v);
+template <> FORCEINLINE __vec4_d __smear_double<__vec4_d>(double v) {
     return __vec4_d(_mm_set1_pd(v), _mm_set1_pd(v));
 }

-static FORCEINLINE __vec4_d __setzero_double() {
+template <class RetVecType> __vec4_d __setzero_double();
+template <> FORCEINLINE __vec4_d __setzero_double<__vec4_d>() {
     return __vec4_d(_mm_setzero_pd(), _mm_setzero_pd());
 }

-static FORCEINLINE __vec4_d __undef_double() {
+template <class RetVecType> __vec4_d __undef_double();
+template <> FORCEINLINE __vec4_d __undef_double<__vec4_d>() {
     return __vec4_d();
 }
@@ -1722,11 +1743,11 @@ static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i8 val) {
 }

 static FORCEINLINE __vec4_i8 __cast_sext(__vec4_i8, __vec4_i1 v) {
-    return __select(v, __smear_i8(0xff), __setzero_i8());
+    return __select(v, __smear_i8<__vec4_i8>(0xff), __setzero_i8<__vec4_i8>());
 }

 static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i1 v) {
-    return __select(v, __smear_i16(0xffff), __setzero_i16());
+    return __select(v, __smear_i16<__vec4_i16>(0xffff), __setzero_i16<__vec4_i16>());
 }

 static FORCEINLINE __vec4_i32 __cast_sext(__vec4_i32, __vec4_i1 v) {
@@ -1786,11 +1807,11 @@ static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i8 val) {
 }

 static FORCEINLINE __vec4_i8 __cast_zext(__vec4_i8, __vec4_i1 v) {
-    return __select(v, __smear_i8(1), __setzero_i8());
+    return __select(v, __smear_i8<__vec4_i8>(1), __setzero_i8<__vec4_i8>());
 }

 static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i1 v) {
-    return __select(v, __smear_i16(1), __setzero_i16());
+    return __select(v, __smear_i16<__vec4_i16>(1), __setzero_i16<__vec4_i16>());
 }

 static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
@@ -1798,7 +1819,7 @@ static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
 }

 static FORCEINLINE __vec4_i64 __cast_zext(__vec4_i64, __vec4_i1 v) {
-    return __select(v, __smear_i64(1), __setzero_i64());
+    return __select(v, __smear_i64<__vec4_i64>(1), __setzero_i64<__vec4_i64>());
 }

 // truncations
@@ -1958,11 +1979,11 @@ static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i64 val) {
 }

 static FORCEINLINE __vec4_f __cast_uitofp(__vec4_f, __vec4_i1 v) {
-    return __select(v, __smear_float(1.), __setzero_float());
+    return __select(v, __smear_float<__vec4_f>(1.), __setzero_float<__vec4_f>());
 }

 static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i1 v) {
-    return __select(v, __smear_double(1.), __setzero_double());
+    return __select(v, __smear_double<__vec4_d>(1.), __setzero_double<__vec4_d>());
 }

 // float/double to signed int
@@ -2897,7 +2918,7 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, uint32_t scale,
     RetScalar r[4];
 #if 1
     // "Fast gather" trick...
-    offsets = __select(mask, offsets, __setzero_i32());
+    offsets = __select(mask, offsets, __setzero_i32<__vec4_i32>());

     int offset = scale * _mm_extract_epi32(offsets.v, 0);
     RetScalar *ptr = (RetScalar *)(p + offset);
@@ -2954,7 +2975,7 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, uint32_t scale,
     RetScalar r[4];
 #if 1
     // "Fast gather" trick...
-    offsets = __select(mask, offsets, __setzero_i64());
+    offsets = __select(mask, offsets, __setzero_i64<__vec4_i64>());

     int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
     RetScalar *ptr = (RetScalar *)(p + offset);

diff --git a/stmt.cpp b/stmt.cpp
index 04807faf..4aaac257 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -2207,7 +2207,7 @@ ForeachUniqueStmt::EmitCode(FunctionEmitContext *ctx) const {
     // lane's value of the varying expression is the same as the value
     // we've selected to process this time through--i.e.:
     // oldMask & (smear(element) == exprValue)
-    llvm::Value *uniqueSmear = ctx->SmearUniform(uniqueValue, "unique_semar");
+    llvm::Value *uniqueSmear = ctx->SmearUniform(uniqueValue, "unique_smear");
     llvm::Value *matchingLanes = NULL;
     if (uniqueValue->getType()->isFloatingPointTy())
         matchingLanes =

diff --git a/type.cpp b/type.cpp
index 272de7a4..942ff45c 100644
--- a/type.cpp
+++ b/type.cpp
@@ -1060,12 +1060,12 @@ PointerType::Mangle() const {
         return "";
     }

-    std::string ret = variability.MangleString() + std::string("<");
+    std::string ret = variability.MangleString() + std::string("_3C_"); // <
     if (isSlice || isFrozen) ret += "-";
     if (isSlice) ret += "s";
     if (isFrozen) ret += "f";
     if (isSlice || isFrozen) ret += "-";
-    return ret + baseType->Mangle() + std::string(">");
+    return ret + baseType->Mangle() + std::string("_3E_"); // >
 }
@@ -1636,7 +1636,7 @@ std::string
 VectorType::Mangle() const {
     std::string s = base->Mangle();
     char buf[16];
-    sprintf(buf, "<%d>", numElements);
+    sprintf(buf, "_3C_%d_3E_", numElements); // "<%d>"
     return s + std::string(buf);
 }
@@ -1789,7 +1789,7 @@ lMangleStructName(const std::string &name, Variability variability) {
         n += buf;
         break;
     default:
-        FATAL("Unexpected varaibility in lMangleStructName()");
+        FATAL("Unexpected variability in lMangleStructName()");
     }

     // And stuff the name at the end....
@@ -2049,7 +2049,7 @@ std::string
 StructType::Mangle() const {
     return lMangleStruct(variability, isConst, name);
 }
-
+

 std::string
 StructType::GetCDeclaration(const std::string &n) const {
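A closing note on the new mangling, since the tokens are easy to
misread: 0x3C and 0x3E are the ASCII codes of '<' and '>', so the
"_3C_"/"_3E_" strings that type.cpp now emits are simply hex-escaped
angle brackets. The sketch below shows the effect on a vector type's
mangled name; it is a hedged illustration using a made-up base string
("vyi"), not the actual VectorType::Mangle() method:

    #include <cstdio>
    #include <string>

    // Escape "<N>" the way the patched vector-type mangling does.
    static std::string mangleVectorSuffix(const std::string &base, int numElements) {
        char buf[16];
        snprintf(buf, sizeof(buf), "_3C_%d_3E_", numElements);  // was "<%d>"
        return base + buf;
    }

    int main() {
        // "vyi" is a hypothetical base mangling, for illustration only.
        std::printf("%s\n", mangleVectorSuffix("vyi", 16).c_str());
        // prints: vyi_3C_16_3E_
        return 0;
    }

With PointerType and VectorType escaping their own brackets, the intent
is that the only raw '<'/'>' characters left in symbol names are the
ones introduced by the templated __smear_*/__setzero_*/__undef_* calls,
which CBEMangle() now deliberately passes through.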