From bc7775aef227308ff8c3a8704ce3f2e4cc95f029 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Mon, 9 Jul 2012 14:35:51 -0700
Subject: [PATCH] Fix __ordered and __unordered floating point functions for
 C++ target.

Fixes include adding "_float" and "_double" suffixes as appropriate, as
well as providing a number of missing implementations. This fixes a
number of failures in the half* tests.
---
 cbackend.cpp                     |  2 +-
 examples/intrinsics/generic-16.h | 20 ++++++++++++++++++--
 examples/intrinsics/generic-32.h | 20 ++++++++++++++++++--
 examples/intrinsics/generic-64.h | 20 ++++++++++++++++++--
 examples/intrinsics/knc.h        | 30 +++++++++++++++---------------
 examples/intrinsics/sse4.h       | 11 +++++++++++
 6 files changed, 81 insertions(+), 22 deletions(-)

diff --git a/cbackend.cpp b/cbackend.cpp
index 1acd24b4..152caf6e 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -3261,7 +3261,7 @@ lPredicateToString(llvm::CmpInst::Predicate p) {
     case llvm::ICmpInst::ICMP_SGT: return "__signed_greater_than";
 
     case llvm::FCmpInst::FCMP_ORD: return "__ordered";
-    case llvm::FCmpInst::FCMP_UNO: return "__cmpunord";
+    case llvm::FCmpInst::FCMP_UNO: return "__unordered";
     case llvm::FCmpInst::FCMP_UEQ: return "__equal";
     case llvm::FCmpInst::FCMP_UNE: return "__not_equal";
     case llvm::FCmpInst::FCMP_ULT: return "__less_than";
diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h
index f0a59542..f1fba1c9 100644
--- a/examples/intrinsics/generic-16.h
+++ b/examples/intrinsics/generic-16.h
@@ -618,7 +618,7 @@ CMP_OP(__vec16_f, float, float, __less_equal, <=)
 CMP_OP(__vec16_f, float, float, __greater_than, >)
 CMP_OP(__vec16_f, float, float, __greater_equal, >=)
 
-static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
+static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) {
     __vec16_i1 ret;
     ret.v = 0;
     for (int i = 0; i < 16; ++i)
@@ -626,6 +626,14 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) {
+    __vec16_i1 ret;
+    ret.v = 0;
+    for (int i = 0; i < 16; ++i)
+        ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
+    return ret;
+}
+
 #if 0
     case Instruction::FRem: intrinsic = "__frem"; break;
 #endif
@@ -770,7 +778,7 @@ CMP_OP(__vec16_d, double, double, __less_equal, <=)
 CMP_OP(__vec16_d, double, double, __greater_than, >)
 CMP_OP(__vec16_d, double, double, __greater_equal, >=)
 
-static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
     ret.v = 0;
     for (int i = 0; i < 16; ++i)
@@ -778,6 +786,14 @@ static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) {
+    __vec16_i1 ret;
+    ret.v = 0;
+    for (int i = 0; i < 16; ++i)
+        ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
+    return ret;
+}
+
 #if 0
     case Instruction::FRem: intrinsic = "__frem"; break;
 #endif
diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h
index e3215e03..57973ddc 100644
--- a/examples/intrinsics/generic-32.h
+++ b/examples/intrinsics/generic-32.h
@@ -686,7 +686,7 @@ CMP_OP(__vec32_f, float, float, __less_equal, <=)
 CMP_OP(__vec32_f, float, float, __greater_than, >)
 CMP_OP(__vec32_f, float, float, __greater_equal, >=)
 
-static FORCEINLINE __vec32_i1 __ordered(__vec32_f a, __vec32_f b) {
+static FORCEINLINE __vec32_i1 __ordered_float(__vec32_f a, __vec32_f b) {
     __vec32_i1 ret;
     ret.v = 0;
     for (int i = 0; i < 32; ++i)
@@ -694,6 +694,14 @@ static FORCEINLINE __vec32_i1 __ordered(__vec32_f a, __vec32_f b) {
     return ret;
 }
 
+static FORCEINLINE __vec32_i1 __unordered_float(__vec32_f a, __vec32_f b) {
+    __vec32_i1 ret;
+    ret.v = 0;
+    for (int i = 0; i < 32; ++i)
+        ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
+    return ret;
+}
+
 #if 0
     case Instruction::FRem: intrinsic = "__frem"; break;
 #endif
@@ -838,7 +846,7 @@ CMP_OP(__vec32_d, double, double, __less_equal, <=)
 CMP_OP(__vec32_d, double, double, __greater_than, >)
 CMP_OP(__vec32_d, double, double, __greater_equal, >=)
 
-static FORCEINLINE __vec32_i1 __ordered(__vec32_d a, __vec32_d b) {
+static FORCEINLINE __vec32_i1 __ordered_double(__vec32_d a, __vec32_d b) {
     __vec32_i1 ret;
     ret.v = 0;
     for (int i = 0; i < 32; ++i)
@@ -846,6 +854,14 @@ static FORCEINLINE __vec32_i1 __ordered(__vec32_d a, __vec32_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec32_i1 __unordered_double(__vec32_d a, __vec32_d b) {
+    __vec32_i1 ret;
+    ret.v = 0;
+    for (int i = 0; i < 32; ++i)
+        ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1 << i) : 0;
+    return ret;
+}
+
 #if 0
     case Instruction::FRem: intrinsic = "__frem"; break;
 #endif
diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h
index 2ee40608..268e0ab1 100644
--- a/examples/intrinsics/generic-64.h
+++ b/examples/intrinsics/generic-64.h
@@ -819,7 +819,7 @@ CMP_OP(__vec64_f, float, float, __less_equal, <=)
 CMP_OP(__vec64_f, float, float, __greater_than, >)
 CMP_OP(__vec64_f, float, float, __greater_equal, >=)
 
-static FORCEINLINE __vec64_i1 __ordered(__vec64_f a, __vec64_f b) {
+static FORCEINLINE __vec64_i1 __ordered_float(__vec64_f a, __vec64_f b) {
     __vec64_i1 ret;
     ret.v = 0;
     for (int i = 0; i < 64; ++i)
@@ -827,6 +827,14 @@ static FORCEINLINE __vec64_i1 __ordered(__vec64_f a, __vec64_f b) {
     return ret;
 }
 
+static FORCEINLINE __vec64_i1 __unordered_float(__vec64_f a, __vec64_f b) {
+    __vec64_i1 ret;
+    ret.v = 0;
+    for (int i = 0; i < 64; ++i)
+        ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1ull << i) : 0;
+    return ret;
+}
+
 #if 0
     case Instruction::FRem: intrinsic = "__frem"; break;
 #endif
@@ -971,7 +979,7 @@ CMP_OP(__vec64_d, double, double, __less_equal, <=)
 CMP_OP(__vec64_d, double, double, __greater_than, >)
 CMP_OP(__vec64_d, double, double, __greater_equal, >=)
 
-static FORCEINLINE __vec64_i1 __ordered(__vec64_d a, __vec64_d b) {
+static FORCEINLINE __vec64_i1 __ordered_double(__vec64_d a, __vec64_d b) {
     __vec64_i1 ret;
     ret.v = 0;
     for (int i = 0; i < 64; ++i)
@@ -979,6 +987,14 @@ static FORCEINLINE __vec64_i1 __ordered(__vec64_d a, __vec64_d b) {
     return ret;
 }
 
+static FORCEINLINE __vec64_i1 __unordered_double(__vec64_d a, __vec64_d b) {
+    __vec64_i1 ret;
+    ret.v = 0;
+    for (int i = 0; i < 64; ++i)
+        ret.v |= ((a.v[i] != a.v[i]) || (b.v[i] != b.v[i])) ? (1ull << i) : 0;
+    return ret;
+}
+
 #if 0
     case Instruction::FRem: intrinsic = "__frem"; break;
 #endif
diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h
index 7bbfa4eb..765c04ad 100644
--- a/examples/intrinsics/knc.h
+++ b/examples/intrinsics/knc.h
@@ -955,16 +955,13 @@ static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) {
     return _mm512_cmpnlt_ps_mask(a, b);
 }
 
-/*
-static FORCEINLINE __vec16_i1 __ordered(__vec16_f a, __vec16_f b) {
-    __vec16_i1 ret;
-    ret.v = 0;
-    for (int i = 0; i < 16; ++i)
-        ret.v |= ((a.v[i] == a.v[i]) && (b.v[i] == b.v[i])) ? (1 << i) : 0;
-    return ret;
+static FORCEINLINE __vec16_i1 __ordered_float(__vec16_f a, __vec16_f b) {
+    return _mm512_cmpord_ps_mask(a, b);
 }
-*/
 
+static FORCEINLINE __vec16_i1 __unordered_float(__vec16_f a, __vec16_f b) {
+    return _mm512_cmpunord_ps_mask(a, b);
+}
 
 static FORCEINLINE __vec16_f __select(__vec16_i1 mask, __vec16_f a, __vec16_f b) {
     return _mm512_mask_mov_ps(b, mask, a);
@@ -1109,16 +1106,19 @@ static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) {
     return ret;
 }
 
-
-/*
-static FORCEINLINE __vec16_i1 __ordered(__vec16_d a, __vec16_d b) {
+static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) {
     __vec16_i1 ret;
-    ret.v = 0;
-    for (int i = 0; i < 16; ++i)
-        ret.v |= ((a.v[i] == a.v[i]) && (b.v[i] == b.v[i])) ? (1 << i) : 0;
+    ret.m8.m1 = _mm512_cmpord_pd_mask(a.v1, b.v1);
+    ret.m8.m2 = _mm512_cmpord_pd_mask(a.v2, b.v2);
+    return ret;
+}
+
+static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) {
+    __vec16_i1 ret;
+    ret.m8.m1 = _mm512_cmpunord_pd_mask(a.v1, b.v1);
+    ret.m8.m2 = _mm512_cmpunord_pd_mask(a.v2, b.v2);
     return ret;
 }
-*/
 
 static FORCEINLINE __vec16_d __select(__vec16_i1 mask, __vec16_d a, __vec16_d b) {
     __vec16_d ret;
diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h
index 886f3c3f..3e692604 100644
--- a/examples/intrinsics/sse4.h
+++ b/examples/intrinsics/sse4.h
@@ -1422,6 +1422,10 @@ static FORCEINLINE __vec4_i1 __ordered_float(__vec4_f a, __vec4_f b) {
     return _mm_cmpord_ps(a.v, b.v);
 }
 
+static FORCEINLINE __vec4_i1 __unordered_float(__vec4_f a, __vec4_f b) {
+    return _mm_cmpunord_ps(a.v, b.v);
+}
+
 static FORCEINLINE __vec4_f __select(__vec4_i1 mask, __vec4_f a, __vec4_f b) {
     return _mm_blendv_ps(b.v, a.v, mask.v);
 }
@@ -1556,6 +1560,13 @@ static FORCEINLINE __vec4_i1 __ordered_double(__vec4_d a, __vec4_d b) {
                           _MM_SHUFFLE(2, 0, 2, 0));
 }
 
+static FORCEINLINE __vec4_i1 __unordered_double(__vec4_d a, __vec4_d b) {
+    __m128d cmp0 = _mm_cmpunord_pd(a.v[0], b.v[0]);
+    __m128d cmp1 = _mm_cmpunord_pd(a.v[1], b.v[1]);
+    return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
+                          _MM_SHUFFLE(2, 0, 2, 0));
+}
+
 static FORCEINLINE __vec4_d __select(__vec4_i1 mask, __vec4_d a, __vec4_d b) {
     __m128 m0 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(1, 1, 0, 0));
     __m128 m1 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(3, 3, 2, 2));
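
For reference, the generic per-lane implementations added above rely on the
IEEE 754 rule that a NaN compares unequal to everything, including itself:
x == x is false exactly when x is NaN. A lane is therefore "ordered" when both
inputs are non-NaN and "unordered" when either input is NaN, so the two masks
are bitwise complements over the valid lanes; this is also why the sse4.h and
knc.h versions can map directly onto the paired cmpord/cmpunord intrinsics.
The standalone sketch below illustrates these semantics; the 4-wide width, the
helper names ordered_mask/unordered_mask, and the test values are illustrative
assumptions, not code from the ispc headers.

// Standalone sketch of the ordered/unordered mask semantics (illustrative
// only; not part of the patch).
#include <cmath>
#include <cstdint>
#include <cstdio>

// Mirrors the loops in generic-16/32/64.h, shrunk to a hypothetical 4-wide
// vector: a lane is ordered iff both inputs are non-NaN.
static uint32_t ordered_mask(const float a[4], const float b[4]) {
    uint32_t m = 0;
    for (int i = 0; i < 4; ++i)
        m |= ((a[i] == a[i]) && (b[i] == b[i])) ? (1u << i) : 0;
    return m;
}

// A lane is unordered iff at least one input is NaN.
static uint32_t unordered_mask(const float a[4], const float b[4]) {
    uint32_t m = 0;
    for (int i = 0; i < 4; ++i)
        m |= ((a[i] != a[i]) || (b[i] != b[i])) ? (1u << i) : 0;
    return m;
}

int main() {
    const float qnan = std::nanf("");
    const float a[4] = { 1.0f, qnan, 3.0f, qnan };
    const float b[4] = { 2.0f, 2.0f, qnan, qnan };
    // Only lane 0 is ordered; lanes 1-3 are unordered, so the two masks are
    // complementary (0x1 and 0xe), matching LLVM's FCMP_ORD/FCMP_UNO.
    std::printf("ordered:   0x%x\n", ordered_mask(a, b));   // prints 0x1
    std::printf("unordered: 0x%x\n", unordered_mask(a, b)); // prints 0xe
    return 0;
}

Because the masks are complements, an __unordered implementation could instead
negate the __ordered result, but the patch's direct cmpunord-based versions
avoid the extra mask operation on targets that provide the paired intrinsic.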