From 1324e6cdd576b1c87d05811084db1406f9856164 Mon Sep 17 00:00:00 2001 From: Andrey Shishpanov Date: Fri, 12 Feb 2016 18:22:48 +0300 Subject: [PATCH] added SKX target definition --- Makefile | 2 +- alloy.py | 16 +++--- builtins.cpp | 17 +++++++ builtins/target-avx512-common.ll | 14 ------ builtins/target-knl.ll | 19 +++++++ builtins/target-skx.ll | 85 ++++++++++++++++++++++++++++++++ ispc.cpp | 69 +++++++++++++++++++++++--- ispc.h | 2 +- module.cpp | 4 +- 9 files changed, 195 insertions(+), 33 deletions(-) create mode 100644 builtins/target-skx.ll diff --git a/Makefile b/Makefile index df7608a7..c84e2e6b 100644 --- a/Makefile +++ b/Makefile @@ -202,7 +202,7 @@ HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h TARGETS=avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ - generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 knl + generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 knl skx ifneq ($(ARM_ENABLED), 0) TARGETS+=neon-32 neon-16 neon-8 endif diff --git a/alloy.py b/alloy.py index b46a0553..5719fb18 100755 --- a/alloy.py +++ b/alloy.py @@ -343,12 +343,12 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, def unsupported_llvm_targets(LLVM_VERSION): - prohibited_list = {"3.2":["avx512knl-i32x16"], - "3.3":["avx512knl-i32x16"], - "3.4":["avx512knl-i32x16"], - "3.5":["avx512knl-i32x16"], - "3.6":["avx512knl-i32x16"], - "3.7":[], + prohibited_list = {"3.2":["avx512knl-i32x16", "avx512skx-i32x16"], + "3.3":["avx512knl-i32x16", "avx512skx-i32x16"], + "3.4":["avx512knl-i32x16", "avx512skx-i32x16"], + "3.5":["avx512knl-i32x16", "avx512skx-i32x16"], + "3.6":["avx512knl-i32x16", "avx512skx-i32x16"], + "3.7":["avx512skx-i32x16"], "3.8":[], "3.9":[], "trunk":[]} @@ -379,7 +379,7 @@ def check_targets(): KNL = ["knl-generic", "avx512knl-i32x16"] targets = [["AVX2", AVX2, False], ["AVX1.1", 
AVX11, False], ["AVX", AVX, False], ["SSE4", SSE4, False], - ["SSE2", SSE2, False], ["KNL", KNL, False]] + ["SSE2", SSE2, False], ["KNL", KNL, False], ["SKX", SKX, False]] f_lines = take_lines("check_isa.exe", "first") for i in range(0,5): if targets[i][0] in f_lines: @@ -403,6 +403,8 @@ def check_targets(): # here we have SDE f_lines = take_lines(sde_exists + " -help", "all") for i in range(0,len(f_lines)): + if targets[6][2] == False and "skx" in f_lines[i]: + answer_sde = answer_sde + [["-skx", "avx512skx-i32x16"]] if targets[5][2] == False and "knl" in f_lines[i]: answer_sde = answer_sde + [["-knl", "knl-generic"], ["-knl", "avx512knl-i32x16"]] if targets[3][2] == False and "wsm" in f_lines[i]: diff --git a/builtins.cpp b/builtins.cpp index d65882c1..dd273136 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -1377,6 +1377,23 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } break; } +#endif +#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+ + case Target::SKX_AVX512: { + switch (g->target->getVectorWidth()) { + case 16: + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_skx_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_skx_64bit); + } + break; + default: + FATAL("logic error in DefineStdlib"); + } + break; + } #endif case Target::GENERIC: { switch (g->target->getVectorWidth()) { diff --git a/builtins/target-avx512-common.ll b/builtins/target-avx512-common.ll index 6a4341de..8a60d90e 100644 --- a/builtins/target-avx512-common.ll +++ b/builtins/target-avx512-common.ll @@ -511,13 +511,6 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline { ret float %half_scale } -declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone - -define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { - %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8) - ret <16 x float> %res -} - 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp @@ -538,13 +531,6 @@ define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { ret float %iv_mul } -declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone - -define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline { - %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %0, <16 x float> undef, i16 -1, i32 8) - ret <16 x float> %res -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; sqrt diff --git a/builtins/target-knl.ll b/builtins/target-knl.ll index 07037b86..799ed500 100644 --- a/builtins/target-knl.ll +++ b/builtins/target-knl.ll @@ -40,5 +40,24 @@ ifelse(LLVM_VERSION, LLVM_3_7, `include(`target-avx512-common.ll')' ) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline { + %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %0, <16 x float> undef, i16 -1, i32 8) + ret <16 x float> %res +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rsqrt + +declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %v, <16 x float> undef, i16 -1, i32 8) + ret <16 x float> %res +} ;;saturation_arithmetic_novec() diff --git a/builtins/target-skx.ll b/builtins/target-skx.ll new file mode 100644 index 00000000..e8929894 --- /dev/null +++ b/builtins/target-skx.ll @@ -0,0 +1,85 @@ +;; Copyright (c) 2016, Intel Corporation +;; All rights reserved. 
+;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +define(`WIDTH',`16') + + +ifelse(LLVM_VERSION, LLVM_3_8, + `include(`target-avx512-common.ll')', + LLVM_VERSION, LLVM_3_9, + `include(`target-avx512-common.ll')' + ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + + +declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone + +define <16 x float> @__rcp_varying_float(<16 x float>) nounwind readonly alwaysinline { + %call = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %0, <16 x float> undef, i16 -1) + ; do one Newton-Raphson iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <16 x float> %0, %call + %two_minus = fsub <16 x float> <float 2., float 2., float 2., float 2., + float 2., float 2., float 2., float 2., + float 2., float 2., float 2., float 2., + float 2., float 2., float 2., float 2.>, %v_iv + %iv_mul = fmul <16 x float> %call, %two_minus + ret <16 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rsqrt + +declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone + +define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { + %is = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %v, <16 x float> undef, i16 -1) + ; Newton-Raphson iteration to improve precision + ; float is = __rsqrt_v(v); + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <16 x float> %v, %is + %v_is_is = fmul <16 x float> %v_is, %is + %three_sub = fsub <16 x float> <float 3., float 3., float 3., float 3., + float 3., float 3., float 3., float 3., + float 3., float 3., float 3., float 3., + float 3., float 3., float 3., float 3.>, %v_is_is + %is_mul = fmul <16 x float> %is, %three_sub + %half_scale = fmul <16 x float> <float 0.5, float 0.5, float 0.5, float 0.5, + float 0.5, float 0.5, float 0.5, float 0.5, + float 0.5, float 0.5, float 0.5, float 0.5, + float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul + ret <16 x float> %half_scale +} + +;;saturation_arithmetic_novec() diff --git a/ispc.cpp b/ispc.cpp index 14bb1f02..a848d5a3 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -165,7 +165,7 @@ lGetSystemISA() { (info2[1] & (1 << 28)) != 0 && // AVX512 CDI (info2[1] & (1 << 30)) != 0 && // AVX512 BW (info2[1] & (1 << 31)) != 0) { // AVX512 VL - return "skx"; + return "avx512skx-i32x16"; } else if ((info2[1] & (1 << 26)) != 0 && // AVX512 PF (info2[1] & (1 << 27)) != 0 && // AVX512 ER @@ -239,10 +239,24 @@ typedef enum { #endif #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_7 // LLVM 3.7+ - // KNL. Supports AVX512. + // Knights Landing - Xeon Phi. + // Supports AVX-512F: All the key AVX-512 features: masking, broadcast... ; + // AVX-512CDI: Conflict Detection; + // AVX-512ERI & PFI: 28-bit precision RCP, RSQRT and EXP transcendentals, + // new prefetch instructions. CPU_KNL, #endif +#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+ + // Skylake Xeon. + // Supports AVX-512F: All the key AVX-512 features: masking, broadcast... ; + // AVX-512CDI: Conflict Detection; + // AVX-512VL: Vector Length Orthogonality; + // AVX-512DQ: New HPC ISA (vs AVX512F); + // AVX-512BW: Byte and Word Support. + CPU_SKX, +#endif + #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_4 // LLVM 3.4+ // Late Atom-like design. Supports SSE 4.2 + POPCNT/LZCNT. 
CPU_Silvermont, @@ -327,6 +341,10 @@ public: names[CPU_KNL].push_back("knl"); #endif +#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+ + names[CPU_SKX].push_back("skx"); +#endif + #ifdef ISPC_ARM_ENABLED names[CPU_CortexA15].push_back("cortex-a15"); @@ -353,6 +371,13 @@ public: CPU_Haswell, CPU_Broadwell, CPU_None); #endif +#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+ + compat[CPU_SKX] = Set(CPU_SKX, CPU_Bonnell, CPU_Penryn, + CPU_Core2, CPU_Nehalem, CPU_Silvermont, + CPU_SandyBridge, CPU_IvyBridge, + CPU_Haswell, CPU_Broadwell, CPU_None); +#endif + #if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5 // LLVM 3.2, 3.3, 3.4 or 3.5 #define CPU_Broadwell CPU_Haswell #else /* LLVM 3.6+ */ @@ -513,6 +538,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo break; #endif +#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+ + case CPU_SKX: + isa = "avx512skx-i32x16"; + break; +#endif + #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_6 case CPU_Broadwell: #endif @@ -915,7 +946,26 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo CPUfromISA = CPU_KNL; } #endif - +#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+ + else if (!strcasecmp(isa, "avx512skx-i32x16")) { + this->m_isa = Target::SKX_AVX512; + this->m_nativeVectorWidth = 16; + this->m_nativeVectorAlignment = 64; + // ?? this->m_dataTypeWidth = 32; + this->m_vectorWidth = 16; + this->m_maskingIsFree = true; + this->m_maskBitCount = 1; + this->m_hasHalf = true; + this->m_hasRand = true; + this->m_hasGather = this->m_hasScatter = true; + this->m_hasTranscendentals = false; + // For MIC it is set to true due to performance reasons. The option should be tested. 
+ this->m_hasTrigonometry = false; + this->m_hasRsqrtd = this->m_hasRcpd = false; + this->m_hasVecPrefetch = false; + CPUfromISA = CPU_SKX; + } +#endif #ifdef ISPC_ARM_ENABLED else if (!strcasecmp(isa, "neon-i8x16")) { this->m_isa = Target::NEON8; @@ -1144,6 +1194,9 @@ Target::SupportedTargets() { "avx2-i32x8, avx2-i32x16, avx2-i64x4, " #if ISPC_LLVM_VERSION >= ISPC_LLVM_3_7 // LLVM 3.7+ "avx512knl-i32x16, " +#endif +#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+ + "avx512skx-i32x16, " #endif "generic-x1, generic-x4, generic-x8, generic-x16, " "generic-x32, generic-x64, *-generic-x16, " @@ -1219,8 +1272,8 @@ Target::ISAToString(ISA isa) { case Target::KNL_AVX512: return "avx512knl"; #endif - case Target::SKX: - return "skx"; + case Target::SKX_AVX512: + return "avx512skx"; case Target::GENERIC: return "generic"; #ifdef ISPC_NVPTX_ENABLED @@ -1267,8 +1320,10 @@ Target::ISAToTargetString(ISA isa) { case Target::KNL_AVX512: return "avx512knl-i32x16"; #endif - case Target::SKX: - return "avx2"; +#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_8 // LLVM 3.8+ + case Target::SKX_AVX512: + return "avx512skx-i32x16"; +#endif case Target::GENERIC: return "generic-4"; #ifdef ISPC_NVPTX_ENABLED diff --git a/ispc.h b/ispc.h index fd0b7c20..e66c0e62 100644 --- a/ispc.h +++ b/ispc.h @@ -193,7 +193,7 @@ public: AVX11 = 3, AVX2 = 4, KNL_AVX512 = 5, - SKX = 6, + SKX_AVX512 = 6, GENERIC = 7, #ifdef ISPC_NVPTX_ENABLED NVPTX, diff --git a/module.cpp b/module.cpp index 830cd99a..e935be17 100644 --- a/module.cpp +++ b/module.cpp @@ -2809,10 +2809,8 @@ lCreateDispatchFunction(llvm::Module *module, llvm::Function *setISAFunc, !g->target->getTreatGenericAsSmth().empty()) { if (g->target->getTreatGenericAsSmth() == "knl_generic") dispatchNum = Target::KNL_AVX512; - else if (g->target->getTreatGenericAsSmth() == "skx_generic") - dispatchNum = Target::SKX; else { - Error(SourcePos(), "*-generic target can be called only with knl or skx"); + Error(SourcePos(), "*-generic target can be called only 
with knl"); exit(1); } }