From 1324e6cdd576b1c87d05811084db1406f9856164 Mon Sep 17 00:00:00 2001
From: Andrey Shishpanov <shishpanov2010@yandex.ru>
Date: Fri, 12 Feb 2016 18:22:48 +0300
Subject: [PATCH] added SKX target definition

---
 Makefile                         |  2 +-
 alloy.py                         | 16 +++---
 builtins.cpp                     | 17 +++++++
 builtins/target-avx512-common.ll | 14 ------
 builtins/target-knl.ll           | 19 +++++++
 builtins/target-skx.ll           | 85 ++++++++++++++++++++++++++++++++
 ispc.cpp                         | 69 +++++++++++++++++++++++---
 ispc.h                           |  2 +-
 module.cpp                       |  4 +-
 9 files changed, 195 insertions(+), 33 deletions(-)
 create mode 100644 builtins/target-skx.ll

diff --git a/Makefile b/Makefile
index df7608a7..c84e2e6b 100644
--- a/Makefile
+++ b/Makefile
@@ -202,7 +202,7 @@ HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
 TARGETS=avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
 	sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
-	generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 knl
+	generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 knl skx
 ifneq ($(ARM_ENABLED), 0)
     TARGETS+=neon-32 neon-16 neon-8
 endif
diff --git a/alloy.py b/alloy.py
index b46a0553..5719fb18 100755
--- a/alloy.py
+++ b/alloy.py
@@ -343,12 +343,12 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
 
 
 def unsupported_llvm_targets(LLVM_VERSION):
-    prohibited_list = {"3.2":["avx512knl-i32x16"],
-                       "3.3":["avx512knl-i32x16"],
-                       "3.4":["avx512knl-i32x16"],
-                       "3.5":["avx512knl-i32x16"],
-                       "3.6":["avx512knl-i32x16"],
-                       "3.7":[],
+    prohibited_list = {"3.2":["avx512knl-i32x16", "avx512skx-i32x16"],
+                       "3.3":["avx512knl-i32x16", "avx512skx-i32x16"],
+                       "3.4":["avx512knl-i32x16", "avx512skx-i32x16"],
+                       "3.5":["avx512knl-i32x16", "avx512skx-i32x16"],
+                       "3.6":["avx512knl-i32x16", "avx512skx-i32x16"],
+                       "3.7":["avx512skx-i32x16"],
                        "3.8":[],
                        "3.9":[],
                        "trunk":[]}   
@@ -379,7 +379,7 @@ def check_targets():
     KNL   = ["knl-generic", "avx512knl-i32x16"]
 
     targets = [["AVX2", AVX2, False], ["AVX1.1", AVX11, False], ["AVX", AVX, False], ["SSE4", SSE4, False], 
-               ["SSE2", SSE2, False], ["KNL", KNL, False]]
+               ["SSE2", SSE2, False], ["KNL", KNL, False], ["SKX", ["avx512skx-i32x16"], False]]  # SKX target list inlined (no separate SKX variable)
     f_lines = take_lines("check_isa.exe", "first")
     for i in range(0,5):
         if targets[i][0] in f_lines:
@@ -403,6 +403,8 @@ def check_targets():
     # here we have SDE
     f_lines = take_lines(sde_exists + " -help", "all")
     for i in range(0,len(f_lines)):
+        if targets[6][2] == False and "skx" in f_lines[i]:
+            answer_sde = answer_sde + [["-skx", "avx512skx-i32x16"]]  # one [sde_flag, target] pair, same shape as the KNL entries
         if targets[5][2] == False and "knl" in f_lines[i]:
             answer_sde = answer_sde + [["-knl", "knl-generic"], ["-knl", "avx512knl-i32x16"]]
         if targets[3][2] == False and "wsm" in f_lines[i]:
diff --git a/builtins.cpp b/builtins.cpp
index d65882c1..dd273136 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -1377,6 +1377,23 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
         }
         break;
     }
+#endif
+#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+
+    case Target::SKX_AVX512: {
+        switch (g->target->getVectorWidth()) {
+        case 16:
+            if (runtime32) {
+                EXPORT_MODULE(builtins_bitcode_skx_32bit);
+            }
+            else {
+                EXPORT_MODULE(builtins_bitcode_skx_64bit);
+            }
+            break;
+        default:
+            FATAL("logic error in DefineStdlib");
+        }
+        break;
+    }
 #endif
     case Target::GENERIC: {
         switch (g->target->getVectorWidth()) {
diff --git a/builtins/target-avx512-common.ll b/builtins/target-avx512-common.ll
index 6a4341de..8a60d90e 100644
--- a/builtins/target-avx512-common.ll
+++ b/builtins/target-avx512-common.ll
@@ -511,13 +511,6 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
   ret float %half_scale
 }
 
-declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
-  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8)
-  ret <16 x float> %res  
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
 
@@ -538,13 +531,6 @@ define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
   ret float %iv_mul
 }
 
-declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
-  %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %0, <16 x float> undef, i16 -1, i32 8)
-  ret <16 x float> %res
-}
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; sqrt
 
diff --git a/builtins/target-knl.ll b/builtins/target-knl.ll
index 07037b86..799ed500 100644
--- a/builtins/target-knl.ll
+++ b/builtins/target-knl.ll
@@ -40,5 +40,24 @@ ifelse(LLVM_VERSION, LLVM_3_7,
     `include(`target-avx512-common.ll')'
   )
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %0, <16 x float> undef, i16 -1, i32 8)
+  ret <16 x float> %res
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8)
+  ret <16 x float> %res
+}
 
 ;;saturation_arithmetic_novec()
diff --git a/builtins/target-skx.ll b/builtins/target-skx.ll
new file mode 100644
index 00000000..e8929894
--- /dev/null
+++ b/builtins/target-skx.ll
@@ -0,0 +1,85 @@
+;;  Copyright (c) 2016, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+define(`WIDTH',`16')
+
+
+ifelse(LLVM_VERSION, LLVM_3_8,
+    `include(`target-avx512-common.ll')',
+         LLVM_VERSION, LLVM_3_9,
+    `include(`target-avx512-common.ll')'
+  )
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+
+declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
+
+define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline {
+  %call = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %0, <16 x float> undef, i16 -1)
+  ; do one Newton-Raphson iteration to improve precision
+  ;  float iv = __rcp_v(v);
+  ;  return iv * (2. - v * iv);
+  %v_iv = fmul <16 x float> %0, %call
+  %two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.,
+                                  float 2., float 2., float 2., float 2.>, %v_iv
+  %iv_mul = fmul <16 x float> %call, %two_minus
+  ret <16 x float> %iv_mul
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
+
+define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline {
+  %is = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %v, <16 x float> undef, i16 -1)
+  ; Newton-Raphson iteration to improve precision
+  ;  float is = __rsqrt_v(v);
+  ;  return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul <16 x float> %v, %is
+  %v_is_is = fmul <16 x float> %v_is, %is
+  %three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.,
+                                  float 3., float 3., float 3., float 3.>, %v_is_is
+  %is_mul = fmul <16 x float> %is, %three_sub
+  %half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5,
+                                   float 0.5, float 0.5, float 0.5, float 0.5,
+                                   float 0.5, float 0.5, float 0.5, float 0.5,
+                                   float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  ret <16 x float> %half_scale
+}
+
+;;saturation_arithmetic_novec()
diff --git a/ispc.cpp b/ispc.cpp
index 14bb1f02..a848d5a3 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -165,7 +165,7 @@ lGetSystemISA() {
             (info2[1] & (1 << 28)) != 0 && // AVX512 CDI
             (info2[1] & (1 << 30)) != 0 && // AVX512 BW
             (info2[1] & (1 << 31)) != 0) { // AVX512 VL
-            return "skx";
+            return "avx512skx-i32x16";
         }
         else if ((info2[1] & (1 << 26)) != 0 && // AVX512 PF
                  (info2[1] & (1 << 27)) != 0 && // AVX512 ER
@@ -239,10 +239,24 @@ typedef enum {
 #endif
 
 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_7 // LLVM 3.7+
-    // KNL. Supports AVX512.
+    // Knights Landing - Xeon Phi.
+    // Supports AVX-512F: All the key AVX-512 features: masking, broadcast... ;
+    //          AVX-512CDI: Conflict Detection;
+    //          AVX-512ERI & PFI: 28-bit precision RCP, RSQRT and EXP transcendentals,
+    //                            new prefetch instructions.
     CPU_KNL,
 #endif
 
+#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+
+    // Skylake Xeon.
+    // Supports AVX-512F: All the key AVX-512 features: masking, broadcast... ;
+    //          AVX-512CDI: Conflict Detection;
+    //          AVX-512VL: Vector Length Orthogonality;
+    //          AVX-512DQ: Doubleword and Quadword instructions (additions over AVX-512F);
+    //          AVX-512BW: Byte and Word Support.
+    CPU_SKX,
+#endif
+
 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_4 // LLVM 3.4+
     // Late Atom-like design. Supports SSE 4.2 + POPCNT/LZCNT.
     CPU_Silvermont,
@@ -327,6 +341,10 @@ public:
          names[CPU_KNL].push_back("knl");
 #endif
 
+#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+
+         names[CPU_SKX].push_back("skx");
+#endif
+
 #ifdef ISPC_ARM_ENABLED
         names[CPU_CortexA15].push_back("cortex-a15");
 
@@ -353,6 +371,13 @@ public:
                                       CPU_Haswell, CPU_Broadwell, CPU_None);
 #endif
 
+#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+
+        compat[CPU_SKX]         = Set(CPU_SKX, CPU_Bonnell, CPU_Penryn,
+                                      CPU_Core2, CPU_Nehalem, CPU_Silvermont,
+                                      CPU_SandyBridge, CPU_IvyBridge,
+                                      CPU_Haswell, CPU_Broadwell, CPU_None);
+#endif
+
 #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5 // LLVM 3.2, 3.3, 3.4 or 3.5
         #define CPU_Broadwell CPU_Haswell
 #else /* LLVM 3.6+ */
@@ -513,6 +538,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
                 break;
 #endif
 
+#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+
+            case CPU_SKX:
+                isa = "avx512skx-i32x16";
+                break;
+#endif
+
 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_6
             case CPU_Broadwell:
 #endif
@@ -915,7 +946,26 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
         CPUfromISA = CPU_KNL;
     }
 #endif
-
+#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+
+    else if (!strcasecmp(isa, "avx512skx-i32x16")) {
+        this->m_isa = Target::SKX_AVX512;
+        this->m_nativeVectorWidth = 16;
+        this->m_nativeVectorAlignment = 64;
+        // ?? this->m_dataTypeWidth = 32;
+        this->m_vectorWidth = 16;
+        this->m_maskingIsFree = true;
+        this->m_maskBitCount = 1;
+        this->m_hasHalf = true;
+        this->m_hasRand = true;
+        this->m_hasGather = this->m_hasScatter = true;
+        this->m_hasTranscendentals = false;
+        // For MIC it is set to true due to performance reasons. The option should be tested.
+        this->m_hasTrigonometry = false;
+        this->m_hasRsqrtd = this->m_hasRcpd = false;
+        this->m_hasVecPrefetch = false;
+        CPUfromISA = CPU_SKX;
+    }
+#endif
 #ifdef ISPC_ARM_ENABLED
     else if (!strcasecmp(isa, "neon-i8x16")) {
         this->m_isa = Target::NEON8;
@@ -1144,6 +1194,9 @@ Target::SupportedTargets() {
         "avx2-i32x8, avx2-i32x16, avx2-i64x4, "
 #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_7 // LLVM 3.7+
         "avx512knl-i32x16, "
+#endif
+#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+
+        "avx512skx-i32x16, "
 #endif
         "generic-x1, generic-x4, generic-x8, generic-x16, "
         "generic-x32, generic-x64, *-generic-x16, "
@@ -1219,8 +1272,8 @@ Target::ISAToString(ISA isa) {
     case Target::KNL_AVX512:
         return "avx512knl";
 #endif
-    case Target::SKX:
-        return "skx";
+    case Target::SKX_AVX512:
+        return "avx512skx";
     case Target::GENERIC:
         return "generic";
 #ifdef ISPC_NVPTX_ENABLED
@@ -1267,8 +1320,10 @@ Target::ISAToTargetString(ISA isa) {
     case Target::KNL_AVX512:
         return "avx512knl-i32x16";
 #endif
-    case Target::SKX:
-        return "avx2";
+#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+
+    case Target::SKX_AVX512:
+        return "avx512skx-i32x16";
+#endif
     case Target::GENERIC:
         return "generic-4";
 #ifdef ISPC_NVPTX_ENABLED
diff --git a/ispc.h b/ispc.h
index fd0b7c20..e66c0e62 100644
--- a/ispc.h
+++ b/ispc.h
@@ -193,7 +193,7 @@ public:
         AVX11          = 3,
         AVX2           = 4,
         KNL_AVX512     = 5,
-        SKX            = 6,
+        SKX_AVX512     = 6,
         GENERIC        = 7,
 #ifdef ISPC_NVPTX_ENABLED
         NVPTX,
diff --git a/module.cpp b/module.cpp
index 830cd99a..e935be17 100644
--- a/module.cpp
+++ b/module.cpp
@@ -2809,10 +2809,8 @@ lCreateDispatchFunction(llvm::Module *module, llvm::Function *setISAFunc,
             !g->target->getTreatGenericAsSmth().empty()) {
             if (g->target->getTreatGenericAsSmth() == "knl_generic")
                 dispatchNum = Target::KNL_AVX512;
-            else if (g->target->getTreatGenericAsSmth() == "skx_generic")
-                dispatchNum = Target::SKX;
             else {
-                Error(SourcePos(), "*-generic target can be called only with knl or skx");
+                Error(SourcePos(), "*-generic target can be called only with knl");
                 exit(1);
             }
         }