From 0d3993fa251ce9a65728dfd96c49214531b12269 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Thu, 5 Jul 2012 20:19:11 -0700
Subject: [PATCH] More varied support for constant vectors in the C++ backend.

If we have a vector of all zeros, a __setzero_* function call is now
emitted, which makes it possible to call specialized intrinsics for that
case.  Undefined values are reflected with an __undef_* call, which
similarly passes that information along.

This change also cleans up the signatures of the __smear_* functions:
since they already have distinct names for each scalar value type, we no
longer need the trick of passing an undefined value of the return vector
type as the first parameter as an indirect way of overloading by return
value.

Issue #317.
---
 builtins/target-generic-common.ll |  26 ++++--
 cbackend.cpp                      | 144 ++++++++++++++++--------------
 examples/intrinsics/generic-16.h  |  39 +++++++-
 examples/intrinsics/generic-32.h  |  40 ++++++++-
 examples/intrinsics/generic-64.h  |  44 ++++++++-
 examples/intrinsics/knc.h         |  46 +++++++++-
 examples/intrinsics/sse4.h        |  95 +++++++++++++++-----
 7 files changed, 330 insertions(+), 104 deletions(-)

diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 9cedf4e4..44320843 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -39,12 +39,26 @@ reduce_equal(WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; broadcast/rotate/shuffle
 
-declare <WIDTH x float> @__smear_float(<WIDTH x float>, float) nounwind readnone
-declare <WIDTH x double> @__smear_double(<WIDTH x double>, double) nounwind readnone
-declare <WIDTH x i8> @__smear_i8(<WIDTH x i8>, i8) nounwind readnone
-declare <WIDTH x i16> @__smear_i16(<WIDTH x i16>, i16) nounwind readnone
-declare <WIDTH x i32> @__smear_i32(<WIDTH x i32>, i32) nounwind readnone
-declare <WIDTH x i64> @__smear_i64(<WIDTH x i64>, i64) nounwind readnone
+declare <WIDTH x float> @__smear_float(float) nounwind readnone
+declare <WIDTH x double> @__smear_double(double) nounwind readnone
+declare <WIDTH x i8> @__smear_i8(i8) nounwind readnone
+declare <WIDTH x i16> @__smear_i16(i16) nounwind readnone
+declare <WIDTH x i32> @__smear_i32(i32) nounwind readnone
+declare <WIDTH x i64> @__smear_i64(i64) nounwind readnone
+
+declare <WIDTH x float> @__setzero_float() nounwind readnone
+declare <WIDTH x double> @__setzero_double() nounwind readnone
+declare <WIDTH x i8> @__setzero_i8() nounwind readnone
+declare <WIDTH x i16> @__setzero_i16() nounwind readnone
+declare <WIDTH x i32> @__setzero_i32() nounwind readnone
+declare <WIDTH x i64> @__setzero_i64() nounwind readnone
+
+declare <WIDTH x float> @__undef_float() nounwind readnone
+declare <WIDTH x double> @__undef_double() nounwind readnone
+declare <WIDTH x i8> @__undef_i8() nounwind readnone
+declare <WIDTH x i16> @__undef_i16() nounwind readnone
+declare <WIDTH x i32> @__undef_i32() nounwind readnone
+declare <WIDTH x i64> @__undef_i64() nounwind readnone
 
 declare <WIDTH x float> @__broadcast_float(<WIDTH x float>, i32) nounwind readnone
 declare <WIDTH x double> @__broadcast_double(<WIDTH x double>, i32) nounwind readnone
diff --git a/cbackend.cpp b/cbackend.cpp
index 280589ca..20a6d210 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -1098,22 +1098,31 @@ bool CWriter::printCast(unsigned opc, llvm::Type *SrcTy, llvm::Type *DstTy) {
 }
 
 
-// FIXME: generalize this/make it not so hard-coded?
-static const char *lGetSmearFunc(llvm::Type *matchType) {
+/** Construct the name of a function with the given base and returning a
+    vector of a given type.  For example, if base is "foo" and matchType is
+    i16, this will return the string "__foo_i16".
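+
+    Returns NULL if there is no typed variant of the function for the
+    given type.  (Note that the returned string is allocated with
+    strdup() and is never freed.)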
+ */
+static const char *
+lGetTypedFunc(const char *base, llvm::Type *matchType) {
+    char buf[64];
+    sprintf(buf, "__%s_", base);
     switch (matchType->getTypeID()) {
-    case llvm::Type::FloatTyID: return "__smear_float";
-    case llvm::Type::DoubleTyID: return "__smear_double";
+    case llvm::Type::FloatTyID: strcat(buf, "float"); break;
+    case llvm::Type::DoubleTyID: strcat(buf, "double"); break;
     case llvm::Type::IntegerTyID: {
         switch (llvm::cast<llvm::IntegerType>(matchType)->getBitWidth()) {
-        case 1: return "__smear_i1";
-        case 8: return "__smear_i8";
-        case 16: return "__smear_i16";
-        case 32: return "__smear_i32";
-        case 64: return "__smear_i64";
+        case 1: strcat(buf, "i1"); break;
+        case 8: strcat(buf, "i8"); break;
+        case 16: strcat(buf, "i16"); break;
+        case 32: strcat(buf, "i32"); break;
+        case 64: strcat(buf, "i64"); break;
+        default: return NULL;
         }
+        break;
     }
     default: return NULL;
     }
+    return strdup(buf);
 }
@@ -1458,64 +1467,63 @@ void CWriter::printConstant(llvm::Constant *CPV, bool Static) {
     }
     case llvm::Type::VectorTyID: {
         llvm::VectorType *VT = llvm::dyn_cast<llvm::VectorType>(CPV->getType());
-        const char *smearFunc = lGetSmearFunc(VT->getElementType());
 
         if (llvm::isa<llvm::ConstantAggregateZero>(CPV)) {
-            assert(smearFunc != NULL);
-
-            llvm::Constant *CZ = llvm::Constant::getNullValue(VT->getElementType());
-            Out << smearFunc << "(";
-            printType(Out, VT);
-            Out << "(), ";
-            printConstant(CZ, Static);
-            Out << ")";
+            // All zeros; call the __setzero_* function.
+            const char *setZeroFunc = lGetTypedFunc("setzero", VT->getElementType());
+            assert(setZeroFunc != NULL);
+            Out << setZeroFunc << "()";
         }
-        else if (llvm::ConstantVector *CV = llvm::dyn_cast<llvm::ConstantVector>(CPV)) {
-            llvm::Constant *splatValue = CV->getSplatValue();
-            if (splatValue != NULL && smearFunc != NULL) {
-                Out << smearFunc << "(";
-                printType(Out, VT);
-                Out << "(), ";
-                printConstant(splatValue, Static);
-                Out << ")";
-            }
-            else {
-                printType(Out, CPV->getType());
-                Out << "(";
-                printConstantVector(CV, Static);
-                Out << ")";
-            }
+        else if (llvm::isa<llvm::UndefValue>(CPV)) {
+            // Undefined value; call __undef_* so that we can potentially
+            // pass this information along.
+            const char *undefFunc = lGetTypedFunc("undef", VT->getElementType());
+            assert(undefFunc != NULL);
+            Out << undefFunc << "()";
         }
-#ifndef LLVM_3_0
-        else if (llvm::ConstantDataVector *CDV = llvm::dyn_cast<llvm::ConstantDataVector>(CPV)) {
-            llvm::Constant *splatValue = CDV->getSplatValue();
-            if (splatValue != NULL && smearFunc != NULL) {
-                Out << smearFunc << "(";
-                printType(Out, VT);
-                Out << "(), ";
-                printConstant(splatValue, Static);
-                Out << ")";
-            }
-            else {
-                printType(Out, CPV->getType());
-                Out << "(";
-                printConstantDataSequential(CDV, Static);
-                Out << ")";
-            }
-        }
-#endif // !LLVM_3_0
         else {
-            assert(llvm::isa<llvm::UndefValue>(CPV));
-            llvm::Constant *CZ = llvm::Constant::getNullValue(VT->getElementType());
-            printType(Out, CPV->getType());
-            Out << "(";
-            printConstant(CZ, Static);
-            for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) {
-                Out << ", ";
-                printConstant(CZ, Static);
-            }
-            Out << ")";
+            const char *smearFunc = lGetTypedFunc("smear", VT->getElementType());
+
+            if (llvm::ConstantVector *CV = llvm::dyn_cast<llvm::ConstantVector>(CPV)) {
+                llvm::Constant *splatValue = CV->getSplatValue();
+                if (splatValue != NULL && smearFunc != NULL) {
+                    // If it's a basic type and has a __smear_* function,
+                    // then call that.
+                    Out << smearFunc << "(";
+                    printConstant(splatValue, Static);
+                    Out << ")";
+                }
+                else {
+                    // Otherwise call the constructor for the type.
+                    printType(Out, CPV->getType());
+                    Out << "(";
+                    printConstantVector(CV, Static);
+                    Out << ")";
+                }
+            }
+#ifndef LLVM_3_0
+            // LLVM 3.1 and beyond have a different representation of
+            // constant vectors.
+            else if (llvm::ConstantDataVector *CDV =
+                     llvm::dyn_cast<llvm::ConstantDataVector>(CPV)) {
+                llvm::Constant *splatValue = CDV->getSplatValue();
+                if (splatValue != NULL && smearFunc != NULL) {
+                    Out << smearFunc << "(";
+                    printConstant(splatValue, Static);
+                    Out << ")";
+                }
+                else {
+                    printType(Out, CPV->getType());
+                    Out << "(";
+                    printConstantDataSequential(CDV, Static);
+                    Out << ")";
+                }
+            }
+#endif // !LLVM_3_0
+            else {
+                llvm::report_fatal_error("Unexpected vector type");
+            }
         }
+        break;
     }
     case llvm::Type::StructTyID:
@@ -4194,7 +4202,7 @@ char SmearCleanupPass::ID = 0;
 
 
 static int
-    lChainLength(llvm::InsertElementInst *inst) {
+lChainLength(llvm::InsertElementInst *inst) {
     int length = 0;
     while (inst != NULL) {
         ++length;
@@ -4242,24 +4250,26 @@ SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         {
             llvm::Type *matchType = toMatch->getType();
-            const char *smearFuncName = lGetSmearFunc(matchType);
+            const char *smearFuncName = lGetTypedFunc("smear", matchType);
 
             if (smearFuncName != NULL) {
                 llvm::Function *smearFunc = module->getFunction(smearFuncName);
                 if (smearFunc == NULL) {
+                    // Declare the smear function if needed; it takes a single
+                    // scalar parameter and returns a vector of that same
+                    // scalar type.
                    llvm::Constant *sf =
                         module->getOrInsertFunction(smearFuncName, iter->getType(),
-                                                    iter->getType(), matchType, NULL);
+                                                    matchType, NULL);
                     smearFunc = llvm::dyn_cast<llvm::Function>(sf);
                     assert(smearFunc != NULL);
                     smearFunc->setDoesNotThrow(true);
                     smearFunc->setDoesNotAccessMemory(true);
                 }
 
-                llvm::Value *undefResult = llvm::UndefValue::get(vt);
                 assert(smearFunc != NULL);
-                llvm::Value *args[2] = { undefResult, toMatch };
-                llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[2]);
+                llvm::Value *args[1] = { toMatch };
+                llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[1]);
                 llvm::Instruction *smearCall =
                     llvm::CallInst::Create(smearFunc, argArray,
                                            LLVMGetName(toMatch, "_smear"),
                                            (llvm::Instruction *)NULL);
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index 6f4e5ed9..438b4d5f 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -262,13 +262,26 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {   \
 }
 
 #define SMEAR(VTYPE, NAME, STYPE)                  \
-static FORCEINLINE VTYPE __smear_##NAME(VTYPE retType, STYPE v) {  \
+static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
     VTYPE ret;                                     \
     for (int i = 0; i < 16; ++i)                   \
         ret.v[i] = v;                              \
     return ret;                                    \
 }
 
+#define SETZERO(VTYPE, NAME)                       \
+static FORCEINLINE VTYPE __setzero_##NAME() {      \
+    VTYPE ret;                                     \
+    for (int i = 0; i < 16; ++i)                   \
+        ret.v[i] = 0;                              \
+    return ret;                                    \
+}
+
+#define UNDEF(VTYPE, NAME)                         \
+static FORCEINLINE VTYPE __undef_##NAME() {        \
+    return VTYPE();                                \
+}
+
 #define BROADCAST(VTYPE, NAME, STYPE)              \
 static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) {   \
     VTYPE ret;                                     \
@@ -394,11 +407,21 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
     *ptr = v.v;
 }
 
-static FORCEINLINE __vec16_i1 __smear_i1(__vec16_i1, int v) {
+static FORCEINLINE __vec16_i1 __smear_i1(int v) {
     return __vec16_i1(v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v);
 }
 
+static FORCEINLINE __vec16_i1 __setzero_i1() {
+    return __vec16_i1(0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static FORCEINLINE __vec16_i1 __undef_i1() {
+    return __vec16_i1();
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // int8
 
@@ -438,6 +461,8 @@ CMP_OP(__vec16_i8, int8_t, __signed_greater_than, >)
 SELECT(__vec16_i8)
 INSERT_EXTRACT(__vec16_i8, int8_t)
 SMEAR(__vec16_i8, i8, int8_t)
+SETZERO(__vec16_i8, i8)
+UNDEF(__vec16_i8, i8)
 BROADCAST(__vec16_i8, i8, int8_t)
 ROTATE(__vec16_i8, i8, int8_t)
 SHUFFLES(__vec16_i8, i8, int8_t)
@@ -481,6 +506,8 @@ CMP_OP(__vec16_i16, int16_t, __signed_greater_than, >)
 SELECT(__vec16_i16)
 INSERT_EXTRACT(__vec16_i16, int16_t)
 SMEAR(__vec16_i16, i16, int16_t)
+SETZERO(__vec16_i16, i16)
+UNDEF(__vec16_i16, i16)
 BROADCAST(__vec16_i16, i16, int16_t)
 ROTATE(__vec16_i16, i16, int16_t)
 SHUFFLES(__vec16_i16, i16, int16_t)
@@ -524,6 +551,8 @@ CMP_OP(__vec16_i32, int32_t, __signed_greater_than, >)
 SELECT(__vec16_i32)
 INSERT_EXTRACT(__vec16_i32, int32_t)
 SMEAR(__vec16_i32, i32, int32_t)
+SETZERO(__vec16_i32, i32)
+UNDEF(__vec16_i32, i32)
 BROADCAST(__vec16_i32, i32, int32_t)
 ROTATE(__vec16_i32, i32, int32_t)
 SHUFFLES(__vec16_i32, i32, int32_t)
@@ -567,6 +596,8 @@ CMP_OP(__vec16_i64, int64_t, __signed_greater_than, >)
 SELECT(__vec16_i64)
 INSERT_EXTRACT(__vec16_i64, int64_t)
 SMEAR(__vec16_i64, i64, int64_t)
+SETZERO(__vec16_i64, i64)
+UNDEF(__vec16_i64, i64)
 BROADCAST(__vec16_i64, i64, int64_t)
 ROTATE(__vec16_i64, i64, int64_t)
 SHUFFLES(__vec16_i64, i64, int64_t)
@@ -602,6 +633,8 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
 SELECT(__vec16_f)
 INSERT_EXTRACT(__vec16_f, float)
 SMEAR(__vec16_f, float, float)
+SETZERO(__vec16_f, float)
+UNDEF(__vec16_f, float)
 BROADCAST(__vec16_f, float, float)
 ROTATE(__vec16_f, float, float)
 SHUFFLES(__vec16_f, float, float)
@@ -752,6 +785,8 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
 SELECT(__vec16_d)
 INSERT_EXTRACT(__vec16_d, double)
 SMEAR(__vec16_d, double, double)
+SETZERO(__vec16_d, double)
+UNDEF(__vec16_d, double)
 BROADCAST(__vec16_d, double, double)
 ROTATE(__vec16_d, double, double)
 SHUFFLES(__vec16_d, double, double)
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index 86a1ea9b..dc55fb00 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -327,13 +327,26 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {   \
 }
 
 #define SMEAR(VTYPE, NAME, STYPE)                  \
-static FORCEINLINE VTYPE __smear_##NAME(VTYPE retType, STYPE v) {  \
+static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
     VTYPE ret;                                     \
     for (int i = 0; i < 32; ++i)                   \
         ret.v[i] = v;                              \
     return ret;                                    \
 }
 
+#define SETZERO(VTYPE, NAME)                       \
+static FORCEINLINE VTYPE __setzero_##NAME() {      \
+    VTYPE ret;                                     \
+    for (int i = 0; i < 32; ++i)                   \
+        ret.v[i] = 0;                              \
+    return ret;                                    \
+}
+
+#define UNDEF(VTYPE, NAME)                         \
+static FORCEINLINE VTYPE __undef_##NAME() {        \
+    return VTYPE();                                \
+}
+
 #define BROADCAST(VTYPE, NAME, STYPE)              \
 static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) {   \
     VTYPE ret;                                     \
@@ -459,13 +472,24 @@ template <int ALIGN> static FORCEINLINE void __store(__vec32_i1 *p, __vec32_i1 v
     *ptr = v.v;
 }
 
-static FORCEINLINE __vec32_i1 __smear_i1(__vec32_i1, int v) {
+static FORCEINLINE __vec32_i1 __smear_i1(int v) {
     return __vec32_i1(v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v);
 }
 
+static FORCEINLINE __vec32_i1 __setzero_i1() {
+    return __vec32_i1(0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static FORCEINLINE __vec32_i1 __undef_i1() {
+    return __vec32_i1();
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // int8
 
@@ -505,6 +529,8 @@ CMP_OP(__vec32_i8, int8_t, __signed_greater_than, >)
 SELECT(__vec32_i8)
 INSERT_EXTRACT(__vec32_i8, int8_t)
 SMEAR(__vec32_i8, i8, int8_t)
+SETZERO(__vec32_i8, i8)
+UNDEF(__vec32_i8, i8)
 BROADCAST(__vec32_i8, i8, int8_t)
 ROTATE(__vec32_i8, i8, int8_t)
 SHUFFLES(__vec32_i8, i8, int8_t)
@@ -548,6 +574,8 @@ CMP_OP(__vec32_i16, int16_t, __signed_greater_than, >)
 SELECT(__vec32_i16)
 INSERT_EXTRACT(__vec32_i16, int16_t)
 SMEAR(__vec32_i16, i16, int16_t)
+SETZERO(__vec32_i16, i16)
+UNDEF(__vec32_i16, i16)
 BROADCAST(__vec32_i16, i16, int16_t)
 ROTATE(__vec32_i16, i16, int16_t)
 SHUFFLES(__vec32_i16, i16, int16_t)
@@ -591,6 +619,8 @@ CMP_OP(__vec32_i32, int32_t, __signed_greater_than, >)
 SELECT(__vec32_i32)
 INSERT_EXTRACT(__vec32_i32, int32_t)
 SMEAR(__vec32_i32, i32, int32_t)
+SETZERO(__vec32_i32, i32)
+UNDEF(__vec32_i32, i32)
 BROADCAST(__vec32_i32, i32, int32_t)
 ROTATE(__vec32_i32, i32, int32_t)
 SHUFFLES(__vec32_i32, i32, int32_t)
@@ -634,6 +664,8 @@ CMP_OP(__vec32_i64, int64_t, __signed_greater_than, >)
 SELECT(__vec32_i64)
 INSERT_EXTRACT(__vec32_i64, int64_t)
 SMEAR(__vec32_i64, i64, int64_t)
+SETZERO(__vec32_i64, i64)
+UNDEF(__vec32_i64, i64)
 BROADCAST(__vec32_i64, i64, int64_t)
 ROTATE(__vec32_i64, i64, int64_t)
 SHUFFLES(__vec32_i64, i64, int64_t)
@@ -669,6 +701,8 @@ static FORCEINLINE __vec32_i1 __ordered(__vec32_f a, __vec32_f b) {
 SELECT(__vec32_f)
 INSERT_EXTRACT(__vec32_f, float)
 SMEAR(__vec32_f, float, float)
+SETZERO(__vec32_f, float)
+UNDEF(__vec32_f, float)
 BROADCAST(__vec32_f, float, float)
 ROTATE(__vec32_f, float, float)
 SHUFFLES(__vec32_f, float, float)
@@ -819,6 +853,8 @@ static FORCEINLINE __vec32_i1 __ordered(__vec32_d a, __vec32_d b) {
 SELECT(__vec32_d)
 INSERT_EXTRACT(__vec32_d, double)
 SMEAR(__vec32_d, double, double)
+SETZERO(__vec32_d, double)
+UNDEF(__vec32_d, double)
 BROADCAST(__vec32_d, double, double)
 ROTATE(__vec32_d, double, double)
 SHUFFLES(__vec32_d, double, double)
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index 59908700..f1eb22ae 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -452,13 +452,26 @@ static FORCEINLINE TYPE NAME(TYPE a, int32_t b) {   \
 }
 
 #define SMEAR(VTYPE, NAME, STYPE)                  \
-static FORCEINLINE VTYPE __smear_##NAME(VTYPE retType, STYPE v) {  \
+static FORCEINLINE VTYPE __smear_##NAME(STYPE v) { \
     VTYPE ret;                                     \
     for (int i = 0; i < 64; ++i)                   \
         ret.v[i] = v;                              \
    return ret;                                     \
 }
 
+#define SETZERO(VTYPE, NAME)                       \
+static FORCEINLINE VTYPE __setzero_##NAME() {      \
+    VTYPE ret;                                     \
+    for (int i = 0; i < 64; ++i)                   \
+        ret.v[i] = 0;                              \
+    return ret;                                    \
+}
+
+#define UNDEF(VTYPE, NAME)                         \
+static FORCEINLINE VTYPE __undef_##NAME() {        \
+    return VTYPE();                                \
+}
+
 #define BROADCAST(VTYPE, NAME, STYPE)              \
 static FORCEINLINE VTYPE __broadcast_##NAME(VTYPE v, int index) {   \
     VTYPE ret;                                     \
@@ -584,7 +597,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec64_i1 *p, __vec64_i1 v
     *ptr = v.v;
 }
 
-static FORCEINLINE __vec64_i1 __smear_i1(__vec64_i1, int v) {
+static FORCEINLINE __vec64_i1 __smear_i1(int v) {
     return __vec64_i1(v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
                       v, v, v, v, v, v, v, v,
@@ -595,6 +608,21 @@ static FORCEINLINE __vec64_i1 __smear_i1(__vec64_i1, int v) {
                       v, v, v, v, v, v, v, v);
 }
 
+static FORCEINLINE __vec64_i1 __setzero_i1() {
+    return __vec64_i1(0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static FORCEINLINE __vec64_i1 __undef_i1() {
+    return __vec64_i1();
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // int8
 
@@ -634,6 +662,8 @@ CMP_OP(__vec64_i8, int8_t, __signed_greater_than, >)
 SELECT(__vec64_i8)
 INSERT_EXTRACT(__vec64_i8, int8_t)
 SMEAR(__vec64_i8, i8, int8_t)
+SETZERO(__vec64_i8, i8)
+UNDEF(__vec64_i8, i8)
 BROADCAST(__vec64_i8, i8, int8_t)
 ROTATE(__vec64_i8, i8, int8_t)
 SHUFFLES(__vec64_i8, i8, int8_t)
@@ -677,6 +707,8 @@ CMP_OP(__vec64_i16, int16_t, __signed_greater_than, >)
 SELECT(__vec64_i16)
 INSERT_EXTRACT(__vec64_i16, int16_t)
 SMEAR(__vec64_i16, i16, int16_t)
+SETZERO(__vec64_i16, i16)
+UNDEF(__vec64_i16, i16)
 BROADCAST(__vec64_i16, i16, int16_t)
 ROTATE(__vec64_i16, i16, int16_t)
 SHUFFLES(__vec64_i16, i16, int16_t)
@@ -720,6 +752,8 @@ CMP_OP(__vec64_i32, int32_t, __signed_greater_than, >)
 SELECT(__vec64_i32)
 INSERT_EXTRACT(__vec64_i32, int32_t)
 SMEAR(__vec64_i32, i32, int32_t)
+SETZERO(__vec64_i32, i32)
+UNDEF(__vec64_i32, i32)
 BROADCAST(__vec64_i32, i32, int32_t)
 ROTATE(__vec64_i32, i32, int32_t)
 SHUFFLES(__vec64_i32, i32, int32_t)
@@ -763,6 +797,8 @@ CMP_OP(__vec64_i64, int64_t, __signed_greater_than, >)
 SELECT(__vec64_i64)
 INSERT_EXTRACT(__vec64_i64, int64_t)
 SMEAR(__vec64_i64, i64, int64_t)
+SETZERO(__vec64_i64, i64)
+UNDEF(__vec64_i64, i64)
 BROADCAST(__vec64_i64, i64, int64_t)
 ROTATE(__vec64_i64, i64, int64_t)
 SHUFFLES(__vec64_i64, i64, int64_t)
@@ -798,6 +834,8 @@ static FORCEINLINE __vec64_i1 __ordered(__vec64_f a, __vec64_f b) {
 SELECT(__vec64_f)
 INSERT_EXTRACT(__vec64_f, float)
 SMEAR(__vec64_f, float, float)
+SETZERO(__vec64_f, float)
+UNDEF(__vec64_f, float)
 BROADCAST(__vec64_f, float, float)
 ROTATE(__vec64_f, float, float)
 SHUFFLES(__vec64_f, float, float)
@@ -948,6 +986,8 @@ static FORCEINLINE __vec64_i1 __ordered(__vec64_d a, __vec64_d b) {
 SELECT(__vec64_d)
 INSERT_EXTRACT(__vec64_d, double)
 SMEAR(__vec64_d, double, double)
+SETZERO(__vec64_d, double)
+UNDEF(__vec64_d, double)
 BROADCAST(__vec64_d, double, double)
 ROTATE(__vec64_d, double, double)
 SHUFFLES(__vec64_d, double, double)
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 9cc6ef22..bcc89bf4 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -477,10 +477,18 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
     *ptr = v.m;
 }
 
-static FORCEINLINE __vec16_i1 __smear_i1(__vec16_i1, int i) {
+static FORCEINLINE __vec16_i1 __smear_i1(int i) {
     return i?0xFFFF:0x0;
 }
 
+static FORCEINLINE __vec16_i1 __setzero_i1() {
+    return 0;
+}
+
+static FORCEINLINE __vec16_i1 __undef_i1() {
+    return __vec16_i1(); // FIXME? __mm512_undef_mask();
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // int8
 
@@ -686,10 +694,18 @@ static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b)
 static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { return ((int32_t *)&v)[index]; }
 static FORCEINLINE void __insert_element(__vec16_i32 *v, int index, int32_t val) { ((int32_t *)v)[index] = val; }
 
-static FORCEINLINE __vec16_i32 __smear_i32(__vec16_i32, int32_t i) {
+static FORCEINLINE __vec16_i32 __smear_i32(int32_t i) {
     return _mm512_set_1to16_epi32(i);
 }
 
+static FORCEINLINE __vec16_i32 __setzero_i32() {
+    return _mm512_setzero_epi32();
+}
+
+static FORCEINLINE __vec16_i32 __undef_i32() {
+    return _mm512_undefined_epi32();
+}
+
 static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) {
     int32_t val = __extract_element(v, index & 0xf);
     return _mm512_set_1to16_epi32(val);
@@ -966,10 +982,18 @@ static FORCEINLINE void __insert_element(__vec16_f *v, int index, float val) {
     ((float *)v)[index] = val;
 }
 
-static FORCEINLINE __vec16_f __smear_float(__vec16_f, float f) {
+static FORCEINLINE __vec16_f __smear_float(float f) {
     return _mm512_set_1to16_ps(f);
 }
 
+static FORCEINLINE __vec16_f __setzero_float() {
+    return _mm512_setzero_ps();
+}
+
+static FORCEINLINE __vec16_f __undef_float() {
+    return _mm512_undefined_ps();
+}
+
 static FORCEINLINE __vec16_f __broadcast_float(__vec16_f v, int index) {
     int32_t val = __extract_element(v, index & 0xf);
     return _mm512_set_1to16_ps(val);
@@ -1116,13 +1140,27 @@ static FORCEINLINE void __insert_element(__vec16_d *v, int index, double val) {
     ((double *)v)[index] = val;
 }
 
-static FORCEINLINE __vec16_d __smear_double(__vec16_d, double d) {
+static FORCEINLINE __vec16_d __smear_double(double d) {
     __vec16_d ret;
     ret.v1 = _mm512_extload_pd(&d, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
     ret.v2 = ret.v1;
     return ret;
 }
 
+static FORCEINLINE __vec16_d __setzero_double() {
+    __vec16_d ret;
+    ret.v1 = _mm512_setzero_pd();
+    ret.v2 = ret.v1;
+    return ret;
+}
+
+static FORCEINLINE __vec16_d __undef_double() {
+    __vec16_d ret;
+    ret.v1 = _mm512_undefined_pd();
+    ret.v2 = ret.v1;
+    return ret;
+}
+
 static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) {
     __vec16_d ret;
     int32_t val = __extract_element(v, index & 0xf);
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 736e5d2d..57a483c6 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -297,10 +297,18 @@ template <int ALIGN> static FORCEINLINE void __store(__vec4_i1 *p, __vec4_i1 val
     _mm_storeu_ps((float *)(&p->v), value.v);
 }
 
-static FORCEINLINE __vec4_i1 __smear_i1(__vec4_i1, int v) {
+static FORCEINLINE __vec4_i1 __smear_i1(int v) {
     return __vec4_i1(v, v, v, v);
 }
 
+static FORCEINLINE __vec4_i1 __setzero_i1() {
+    return __vec4_i1(_mm_setzero_ps());
+}
+
+static FORCEINLINE __vec4_i1 __undef_i1() {
+    return __vec4_i1();
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // int8
 
@@ -524,10 +532,18 @@ static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) {
     ((int8_t *)v)[index] = val;
 }
 
-static FORCEINLINE __vec4_i8 __smear_i8(__vec4_i8, int8_t v) {
+static FORCEINLINE __vec4_i8 __smear_i8(int8_t v) {
     return _mm_set1_epi8(v);
 }
 
+static FORCEINLINE __vec4_i8 __setzero_i8() {
+    return _mm_set1_epi8(0);
+}
+
+static FORCEINLINE __vec4_i8 __undef_i8() {
+    return __vec4_i8();
+}
+
 static FORCEINLINE __vec4_i8 __broadcast_i8(__vec4_i8 v, int index) {
     return _mm_set1_epi8(__extract_element(v, index));
 }
@@ -783,10 +799,18 @@ static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val)
     ((int16_t *)v)[index] = val;
 }
 
-static FORCEINLINE __vec4_i16 __smear_i16(__vec4_i16, int16_t v) {
+static FORCEINLINE __vec4_i16 __smear_i16(int16_t v) {
     return _mm_set1_epi16(v);
 }
 
+static FORCEINLINE __vec4_i16 __setzero_i16() {
+    return _mm_set1_epi16(0);
+}
+
+static FORCEINLINE __vec4_i16 __undef_i16() {
+    return __vec4_i16();
+}
+
 static FORCEINLINE __vec4_i16 __broadcast_i16(__vec4_i16 v, int index) {
     return _mm_set1_epi16(__extract_element(v, index));
 }
@@ -1020,10 +1044,18 @@ static FORCEINLINE __vec4_i32 __select(__vec4_i1 mask, __vec4_i32 a, __vec4_i32
                                           _mm_castsi128_ps(a.v), mask.v));
 }
 
-static FORCEINLINE __vec4_i32 __smear_i32(__vec4_i32, int32_t v) {
+static FORCEINLINE __vec4_i32 __smear_i32(int32_t v) {
     return _mm_set1_epi32(v);
 }
 
+static FORCEINLINE __vec4_i32 __setzero_i32() {
+    return _mm_castps_si128(_mm_setzero_ps());
+}
+
+static FORCEINLINE __vec4_i32 __undef_i32() {
+    return __vec4_i32();
+}
+
 static FORCEINLINE int32_t __extract_element(__vec4_i32 v, int index) {
     return ((int32_t *)&v)[index];
 }
@@ -1281,10 +1313,18 @@ static FORCEINLINE __vec4_i64 __select(__vec4_i1 mask, __vec4_i64 a, __vec4_i64
     return __vec4_i64(_mm_castpd_si128(r0), _mm_castpd_si128(r1));
 }
 
-static FORCEINLINE __vec4_i64 __smear_i64(__vec4_i64, int64_t v) {
+static FORCEINLINE __vec4_i64 __smear_i64(int64_t v) {
     return __vec4_i64(v, v, v, v);
 }
 
+static FORCEINLINE __vec4_i64 __setzero_i64() {
+    return __vec4_i64(0, 0, 0, 0);
+}
+
+static FORCEINLINE __vec4_i64 __undef_i64() {
+    return __vec4_i64();
+}
+
 static FORCEINLINE int64_t __extract_element(__vec4_i64 v, int index) {
     return ((int64_t *)&v)[index];
 }
@@ -1385,10 +1425,18 @@ static FORCEINLINE __vec4_f __select(__vec4_i1 mask, __vec4_f a, __vec4_f b) {
     return _mm_blendv_ps(b.v, a.v, mask.v);
 }
 
-static FORCEINLINE __vec4_f __smear_float(__vec4_f, float v) {
+static FORCEINLINE __vec4_f __smear_float(float v) {
     return _mm_set1_ps(v);
 }
 
+static FORCEINLINE __vec4_f __setzero_float() {
+    return _mm_setzero_ps();
+}
+
+static FORCEINLINE __vec4_f __undef_float() {
+    return __vec4_f();
+}
+
 static FORCEINLINE float __extract_element(__vec4_f v, int index) {
     return ((float *)&v)[index];
 }
@@ -1517,10 +1565,18 @@ static FORCEINLINE __vec4_d __select(__vec4_i1 mask, __vec4_d a, __vec4_d b) {
     return __vec4_d(r0, r1);
 }
 
-static FORCEINLINE __vec4_d __smear_double(__vec4_d, double v) {
+static FORCEINLINE __vec4_d __smear_double(double v) {
    return __vec4_d(_mm_set1_pd(v), _mm_set1_pd(v));
 }
 
+static FORCEINLINE __vec4_d __setzero_double() {
+    return __vec4_d(_mm_setzero_pd(), _mm_setzero_pd());
+}
+
+static FORCEINLINE __vec4_d __undef_double() {
+    return __vec4_d();
+}
+
 static FORCEINLINE double __extract_element(__vec4_d v, int index) {
     return ((double *)&v)[index];
 }
@@ -1617,13 +1673,11 @@ static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i8 val) {
 }
 
 static FORCEINLINE __vec4_i8 __cast_sext(__vec4_i8, __vec4_i1 v) {
-    return __select(v, __smear_i8(__vec4_i8(), 0xff),
-                    __smear_i8(__vec4_i8(), 0));
+    return __select(v, __smear_i8(0xff), __setzero_i8());
 }
 
 static FORCEINLINE __vec4_i16 __cast_sext(__vec4_i16, __vec4_i1 v) {
-    return __select(v, __smear_i16(__vec4_i16(), 0xffff),
-                    __smear_i16(__vec4_i16(), 0));
+    return __select(v, __smear_i16(0xffff), __setzero_i16());
 }
 
 static FORCEINLINE __vec4_i32 __cast_sext(__vec4_i32, __vec4_i1 v) {
@@ -1683,12 +1737,11 @@ static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i8 val) {
 }
 
 static FORCEINLINE __vec4_i8 __cast_zext(__vec4_i8, __vec4_i1 v) {
-    return __select(v, __smear_i8(__vec4_i8(), 1), __smear_i8(__vec4_i8(), 0));
+    return __select(v, __smear_i8(1), __setzero_i8());
 }
 
 static FORCEINLINE __vec4_i16 __cast_zext(__vec4_i16, __vec4_i1 v) {
-    return __select(v, __smear_i16(__vec4_i16(), 1),
-                    __smear_i16(__vec4_i16(), 0));
+    return __select(v, __smear_i16(1), __setzero_i16());
 }
 
 static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
@@ -1696,7 +1749,7 @@ static FORCEINLINE __vec4_i32 __cast_zext(__vec4_i32, __vec4_i1 v) {
 }
 
 static FORCEINLINE __vec4_i64 __cast_zext(__vec4_i64, __vec4_i1 v) {
-    return __select(v, __smear_i64(__vec4_i64(), 1), __smear_i64(__vec4_i64(), 0));
+    return __select(v, __smear_i64(1), __setzero_i64());
 }
 
 // truncations
@@ -1856,11 +1909,11 @@ static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i64 val) {
 }
 
 static FORCEINLINE __vec4_f __cast_uitofp(__vec4_f, __vec4_i1 v) {
-    return __select(v, __smear_float(__vec4_f(), 1.), __smear_float(__vec4_f(), 0.));
+    return __select(v, __smear_float(1.), __setzero_float());
 }
 
 static FORCEINLINE __vec4_d __cast_uitofp(__vec4_d, __vec4_i1 v) {
-    return __select(v, __smear_double(__vec4_d(), 1.), __smear_double(__vec4_d(), 0.));
+    return __select(v, __smear_double(1.), __setzero_double());
 }
 
 // float/double to signed int
@@ -2795,8 +2848,8 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
     RetScalar r[4];
 #if 1
     // "Fast gather" trick...
-    offsets = __select(mask, offsets, __smear_i32(__vec4_i32(), 0));
-    constOffset = __select(mask, constOffset, __smear_i32(__vec4_i32(), 0));
+    offsets = __select(mask, offsets, __setzero_i32());
+    constOffset = __select(mask, constOffset, __setzero_i32());
 
     int offset = scale * _mm_extract_epi32(offsets.v, 0) +
         _mm_extract_epi32(constOffset.v, 0);
     RetScalar *ptr = (RetScalar *)(p + offset);
@@ -2853,8 +2906,8 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
     RetScalar r[4];
 #if 1
     // "Fast gather" trick...
-    offsets = __select(mask, offsets, __smear_i64(__vec4_i64(), 0));
-    constOffset = __select(mask, constOffset, __smear_i64(__vec4_i64(), 0));
+    offsets = __select(mask, offsets, __setzero_i64());
+    constOffset = __select(mask, constOffset, __setzero_i64());
 
     int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +
         _mm_extract_epi64(constOffset.v[0], 0);
     RetScalar *ptr = (RetScalar *)(p + offset);
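
For reference, the sketch below shows the shape of the code the C++ backend
emits for constant vectors after this change.  The 4-wide float type and the
three helpers are modeled on the sse4.h definitions above; the main() driver
and the specific constants are illustrative only and are not part of the
patch.

    // Minimal sketch; compile with SSE enabled (e.g. -msse).
    #include <xmmintrin.h>
    #include <cstdio>

    struct __vec4_f {
        __vec4_f() { }                   // contents deliberately left undefined
        __vec4_f(__m128 vv) : v(vv) { }
        __m128 v;
    };

    // Splat a scalar across all lanes (new one-argument signature).
    static inline __vec4_f __smear_float(float f) { return _mm_set1_ps(f); }

    // All-zero vector; maps directly to _mm_setzero_ps().
    static inline __vec4_f __setzero_float() { return _mm_setzero_ps(); }

    // Undefined contents; any value is acceptable.
    static inline __vec4_f __undef_float() { return __vec4_f(); }

    int main() {
        // The backend now emits one of three typed calls per constant vector:
        __vec4_f zero  = __setzero_float();   // was __smear_float(__vec4_f(), 0.f)
        __vec4_f splat = __smear_float(3.f);  // was __smear_float(__vec4_f(), 3.f)
        __vec4_f undef = __undef_float();     // was printed as an all-zero constructor
        (void)undef;                          // never read; contents are unspecified

        float z[4], s[4];
        _mm_storeu_ps(z, zero.v);
        _mm_storeu_ps(s, splat.v);
        printf("zero[0] = %g, splat[2] = %g\n", z[0], s[2]);
        return 0;
    }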