From bc7775aef227308ff8c3a8704ce3f2e4cc95f029 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Mon, 9 Jul 2012 14:35:51 -0700
Subject: [PATCH] Fix __ordered and __unordered floating point functions for
 C++ target.

Fixes include adding "_float" and "_double" suffixes as appropriate, as
well as providing a number of missing implementations. This fixes a
number of failures in the half* tests.
---
 cbackend.cpp                     |  2 +-
 examples/intrinsics/generic-16.h | 20 ++++++++++++++++++--
 examples/intrinsics/generic-32.h | 20 ++++++++++++++++++--
 examples/intrinsics/generic-64.h | 20 ++++++++++++++++++--
 examples/intrinsics/knc.h        | 30 +++++++++++++++---------------
 examples/intrinsics/sse4.h       | 11 +++++++++++
 6 files changed, 81 insertions(+), 22 deletions(-)

diff --git a/cbackend.cpp b/cbackend.cpp
index 1acd24b4..152caf6e 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -3261,7 +3261,7 @@ lPredicateToString(llvm::CmpInst::Predicate p) {
     case llvm::ICmpInst::ICMP_SGT: return "__signed_greater_than";
 
     case llvm::FCmpInst::FCMP_ORD: return "__ordered";
-    case llvm::FCmpInst::FCMP_UNO: return "__cmpunord";
+    case llvm::FCmpInst::FCMP_UNO: return "__unordered";
     case llvm::FCmpInst::FCMP_UEQ: return "__equal";
     case llvm::FCmpInst::FCMP_UNE: return "__not_equal";
     case llvm::FCmpInst::FCMP_ULT: return "__less_than";
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index f0a59542..f1fba1c9 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -618,7 +618,7 @@ CMP_OP(__vec16_f, float, float, __less_equal, <=)
 CMP_OP(__vec16_f, float, float, __greater_than, >)
 CMP_OP(__vec16_f, float, float, __greater_equal, >=)
 
-static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) {
     __vec16_i1 ret;
     ret.v = 0;
     for (int i = 0; i < 16; ++i)
@@ -626,6 +626,14 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) {
+    __vec16_i1 ret;
+    ret.v = 0;
+    for (int i = 0; i < 16; ++i)
+        ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
+    return ret;
+}
+
 #if 0
     case Instruction::FRem: intrinsic = "__frem"; break;
 #endif
@@ -770,7 +778,7 @@ CMP_OP(__vec16_d, double, double, __less_equal, <=)
 CMP_OP(__vec16_d, double, double, __greater_than, >)
 CMP_OP(__vec16_d, double, double, __greater_equal, >=)
 
-static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.v = 0;
     for (int i = 0; i < 16; ++i)
@@ -778,6 +786,14 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) {
+    __vec16_i1 ret;
+    ret.v = 0;
+    for (int i = 0; i < 16; ++i)
+        ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
+    return ret;
+}
+
 #if 0
     case Instruction::FRem: intrinsic = "__frem"; break;
 #endif
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index e3215e03..57973ddc 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -686,7 +686,7 @@ CMP_OP(__vec32_f, float, float, __less_equal, <=)
 CMP_OP(__vec32_f, float, float, __greater_than, >)
 CMP_OP(__vec32_f, float, float, __greater_equal, >=)
 
-static FORCEINLINE __vec32_i1 __ordered(__vec32_f a, __vec32_f b) {
+static FORCEINLINE __vec32_i1 __ordered_float(__vec32_f a, __vec32_f b) {
     __vec32_i1 ret;
     ret.v = 0;
     for (int i = 0; i < 32; ++i)
@@ -694,6 +694,14 @@ static FORCEINLINE __vec32_i1 __ordered(__vec32_f a, __vec32_f b) {
     return ret;
 }
 
+static FORCEINLINE __vec32_i1 __unordered_float(__vec32_f a, __vec32_f b) {
+    __vec32_i1 ret;
+    ret.v = 0;
+    for (int i = 0; i < 32; ++i)
+        ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
+    return ret;
+}
+
 #if 0
     case Instruction::FRem: intrinsic = "__frem"; break;
 #endif
@@ -838,7 +846,7 @@ CMP_OP(__vec32_d, double, double, __less_equal, <=)
 CMP_OP(__vec32_d, double, double, __greater_than, >)
 CMP_OP(__vec32_d, double, double, __greater_equal, >=)
 
-static FORCEINLINE __vec32_i1 __ordered(__vec32_d a, __vec32_d b) {
+static FORCEINLINE __vec32_i1 __ordered_double(__vec32_d a, __vec32_d b) {
     __vec32_i1 ret;
     ret.v = 0;
     for (int i = 0; i < 32; ++i)
@@ -846,6 +854,14 @@ static FORCEINLINE __vec32_i1 __ordered(__vec32_d a, __vec32_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec32_i1 __unordered_double(__vec32_d a, __vec32_d b) {
+    __vec32_i1 ret;
+    ret.v = 0;
+    for (int i = 0; i < 32; ++i)
+        ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
+    return ret;
+}
+
 #if 0
     case Instruction::FRem: intrinsic = "__frem"; break;
 #endif
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index 2ee40608..268e0ab1 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -819,7 +819,7 @@ CMP_OP(__vec64_f, float, float, __less_equal, <=)
 CMP_OP(__vec64_f, float, float, __greater_than, >)
 CMP_OP(__vec64_f, float, float, __greater_equal, >=)
 
-static FORCEINLINE __vec64_i1 __ordered(__vec64_f a, __vec64_f b) {
+static FORCEINLINE __vec64_i1 __ordered_float(__vec64_f a, __vec64_f b) {
     __vec64_i1 ret;
     ret.v = 0;
     for (int i = 0; i < 64; ++i)
@@ -827,6 +827,14 @@ static FORCEINLINE __vec64_i1 __ordered(__vec64_f a, __vec64_f b) {
     return ret;
 }
 
+static FORCEINLINE __vec64_i1 __unordered_float(__vec64_f a, __vec64_f b) {
+    __vec64_i1 ret;
+    ret.v = 0;
+    for (int i = 0; i < 64; ++i)
+        ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1ull << i) : 0;
+    return ret;
+}
+
 #if 0
     case Instruction::FRem: intrinsic = "__frem"; break;
 #endif
@@ -971,7 +979,7 @@ CMP_OP(__vec64_d, double, double, __less_equal, <=)
 CMP_OP(__vec64_d, double, double, __greater_than, >)
 CMP_OP(__vec64_d, double, double, __greater_equal, >=)
 
-static FORCEINLINE __vec64_i1 __ordered(__vec64_d a, __vec64_d b) {
+static FORCEINLINE __vec64_i1 __ordered_double(__vec64_d a, __vec64_d b) {
     __vec64_i1 ret;
     ret.v = 0;
     for (int i = 0; i < 64; ++i)
@@ -979,6 +987,14 @@ static FORCEINLINE __vec64_i1 __ordered(__vec64_d a, __vec64_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec64_i1 __unordered_double(__vec64_d a, __vec64_d b) {
+    __vec64_i1 ret;
+    ret.v = 0;
+    for (int i = 0; i < 64; ++i)
+        ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1ull << i) : 0;
+    return ret;
+}
+
 #if 0
     case Instruction::FRem: intrinsic = "__frem"; break;
 #endif
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 7bbfa4eb..765c04ad 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -955,16 +955,13 @@ static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpnlt_ps_mask(a, b);
 }
 
-/*
-static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
-    __vec16_i1 ret;
-    ret.v = 0;
-    for (int i = 0; i < 16; ++i)
-        ret.v |= ((a.v[i] == a.v[i]) && (b.v[i] == b.v[i])) ? (1 << i) : 0;
-    return ret;
+static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) {
+    return _mm512_cmpord_ps_mask(a, b);
 }
-*/
 
+static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) {
+    return _mm512_cmpunord_ps_mask(a, b);
+}
 
 static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) {
     return _mm512_mask_mov_ps(b, mask, a);
@@ -1109,16 +1106,19 @@ static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
-
-/*
-static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
-    ret.v = 0;
-    for (int i = 0; i < 16; ++i)
-        ret.v |= ((a.v[i] == a.v[i]) && (b.v[i] == b.v[i])) ? (1 << i) : 0;
+    ret.m8.m1 = _mm512_cmpord_pd_mask(a.v1, b.v1);
+    ret.m8.m2 = _mm512_cmpord_pd_mask(a.v2, b.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_cmpunord_pd_mask(a.v1, b.v1);
+    ret.m8.m2 = _mm512_cmpunord_pd_mask(a.v2, b.v2);
     return ret;
 }
-*/
 
 static FORCEINLINE __vec16_d __select(__vec16_i1 mask, __vec16_d a, __vec16_d b) {
     __vec16_d ret;
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 886f3c3f..3e692604 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -1422,6 +1422,10 @@ static FORCEINLINE __vec4_i1 __ordered_float(__vec4_f a, __vec4_f b) {
     return _mm_cmpord_ps(a.v, b.v);
 }
 
+static FORCEINLINE __vec4_i1 __unordered_float(__vec4_f a, __vec4_f b) {
+    return _mm_cmpunord_ps(a.v, b.v);
+}
+
 static FORCEINLINE __vec4_f __select(__vec4_i1 mask, __vec4_f a, __vec4_f b) {
     return _mm_blendv_ps(b.v, a.v, mask.v);
 }
@@ -1556,6 +1560,13 @@ static FORCEINLINE __vec4_i1 __ordered_double(__vec4_d a, __vec4_d b) {
                           _MM_SHUFFLE(2, 0, 2, 0));
 }
 
+static FORCEINLINE __vec4_i1 __unordered_double(__vec4_d a, __vec4_d b) {
+    __m128d cmp0 = _mm_cmpunord_pd(a.v[0], b.v[0]);
+    __m128d cmp1 = _mm_cmpunord_pd(a.v[1], b.v[1]);
+    return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
+                          _MM_SHUFFLE(2, 0, 2, 0));
+}
+
 static FORCEINLINE __vec4_d __select(__vec4_i1 mask, __vec4_d a, __vec4_d b) {
     __m128 m0 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(1, 1, 0, 0));
     __m128 m1 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(3, 3, 2, 2));
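
For reference, the generic per-lane implementations added above rely on the
IEEE 754 rule that a NaN compares unequal to everything, including itself:
x == x is false exactly when x is NaN. A lane is therefore "ordered" when both
inputs are non-NaN and "unordered" when either input is NaN, so the two masks
are bitwise complements over the valid lanes; this is also why the sse4.h and
knc.h versions can map directly onto the paired cmpord/cmpunord intrinsics.
The standalone sketch below illustrates these semantics; the 4-wide width, the
helper names ordered_mask/unordered_mask, and the test values are illustrative
assumptions, not code from the ispc headers.

// Standalone sketch of the ordered/unordered mask semantics (illustrative
// only; not part of the patch).
#include <cmath>
#include <cstdint>
#include <cstdio>

// Mirrors the loops in generic-16/32/64.h, shrunk to a hypothetical 4-wide
// vector: a lane is ordered iff both inputs are non-NaN.
static uint32_t ordered_mask(const float a[4], const float b[4]) {
    uint32_t m = 0;
    for (int i = 0; i < 4; ++i)
        m |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1u << i) : 0;
    return m;
}

// A lane is unordered iff at least one input is NaN.
static uint32_t unordered_mask(const float a[4], const float b[4]) {
    uint32_t m = 0;
    for (int i = 0; i < 4; ++i)
        m |= ((a[i] != a[i]) || (b[i] != b[i])) ? (1u << i) : 0;
    return m;
}

int main() {
    const float qnan = std::nanf("");
    const float a[4] = { 1.0f, qnan, 3.0f, qnan };
    const float b[4] = { 2.0f, 2.0f, qnan, qnan };
    // Only lane 0 is ordered; lanes 1-3 are unordered, so the two masks are
    // complementary (0x1 and 0xe), matching LLVM's FCMP_ORD/FCMP_UNO.
    std::printf("ordered:   0x%x\n", ordered_mask(a, b));   // prints 0x1
    std::printf("unordered: 0x%x\n", unordered_mask(a, b)); // prints 0xe
    return 0;
}

Because the masks are complements, an __unordered implementation could instead
negate the __ordered result, but the patch's direct cmpunord-based versions
avoid the extra mask operation on targets that provide the paired intrinsic.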