More varied support for constant vectors from C++ backend.

If we have a vector of all zeros, a __setzero_* function call is emitted, permitting calling specialized intrinsics for this. Undefined values are reflected with an __undef_* call, which similarly allows passing that information along. This change also includes a cleanup to the signature of the __smear_* functions; since they already have different names depending on the scalar value type, we don't need to use the trick of passing an undefined value of the return vector type as the first parameter as an indirect way to overload by return value. Issue #317.
2012-07-05 20:19:11 -07:00
parent ac421f68e2
commit 0d3993fa25
7 changed files with 330 additions and 104 deletions
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -477,10 +477,18 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v
    *ptr = v.m;
 }

-static FORCEINLINE __vec16_i1 __smear_i1(__vec16_i1, int i) {
+static FORCEINLINE __vec16_i1 __smear_i1(int i) {
    return i?0xFFFF:0x0;
 }

+static FORCEINLINE __vec16_i1 __setzero_i1() {
+    return 0;
+}
+
+static FORCEINLINE __vec16_i1 __undef_i1() {
+    return __vec16_i1(); // FIXME? __mm512_undef_mask();
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // int8

@@ -686,10 +694,18 @@ static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b)
 static FORCEINLINE int32_t __extract_element(__vec16_i32 v, int index) { return ((int32_t *)&v)[index]; }
 static FORCEINLINE void __insert_element(__vec16_i32 *v, int index, int32_t val) { ((int32_t *)v)[index] = val; }

-static FORCEINLINE __vec16_i32 __smear_i32(__vec16_i32, int32_t i) {
+static FORCEINLINE __vec16_i32 __smear_i32(int32_t i) {
    return _mm512_set_1to16_epi32(i);
 }

+static FORCEINLINE __vec16_i32 __setzero_i32() {
+    return _mm512_setzero_epi32();
+}
+
+static FORCEINLINE __vec16_i32 __undef_i32() {
+    return _mm512_undefined_epi32();
+}
+
 static FORCEINLINE __vec16_i32 __broadcast_i32(__vec16_i32 v, int index) {
    int32_t val = __extract_element(v, index & 0xf);
    return _mm512_set_1to16_epi32(val);
@@ -966,10 +982,18 @@ static FORCEINLINE void  __insert_element(__vec16_f *v, int index, float val) {
    ((float *)v)[index] = val;
 }

-static FORCEINLINE __vec16_f __smear_float(__vec16_f, float f) {
+static FORCEINLINE __vec16_f __smear_float(float f) {
    return _mm512_set_1to16_ps(f);
 }

+static FORCEINLINE __vec16_f __setzero_float() {
+    return _mm512_setzero_ps();
+}
+
+static FORCEINLINE __vec16_f __undef_float() {
+    return _mm512_undefined_ps();
+}
+
 static FORCEINLINE __vec16_f __broadcast_float(__vec16_f v, int index) {
    int32_t val = __extract_element(v, index & 0xf);
    return _mm512_set_1to16_ps(val);
@@ -1116,13 +1140,27 @@ static FORCEINLINE void  __insert_element(__vec16_d *v, int index, double val) {
    ((double *)v)[index] = val;
 }

-static FORCEINLINE __vec16_d __smear_double(__vec16_d, double d) {
+static FORCEINLINE __vec16_d __smear_double(double d) {
    __vec16_d ret;
    ret.v1 = _mm512_extload_pd(&d, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
    ret.v2 = ret.v1;
    return ret;
 }

+static FORCEINLINE __vec16_d __setzero_double() {
+    __vec16_d ret;
+    ret.v1 = _mm512_setzero_pd();
+    ret.v2 = ret.v1;
+    return ret;
+}
+
+static FORCEINLINE __vec16_d __undef_double() {
+    __vec16_d ret;
+    ret.v1 = _mm512_undefined_pd();
+    ret.v2 = ret.v1;
+    return ret;
+}
+
 static FORCEINLINE __vec16_d __broadcast_double(__vec16_d v, int index) {
    __vec16_d ret;
    int32_t val = __extract_element(v, index & 0xf);