From e04d21b997b81b5179ffb746d95d22de4d428cb7 Mon Sep 17 00:00:00 2001
From: Andrey Shishpanov
Date: Thu, 7 Apr 2016 15:56:06 +0300
Subject: [PATCH] Replaced SKX patches with a single cumulative patch, added
 mask and i1 store fixes, and switched to CMake configuration for LLVM 3.8
 and newer versions.

---
 alloy.py                                     |   7 +-
 .../3_8_r260878_mask_store_size.patch        | 263 ------
 llvm_patches/3_8_r261619-avx512-PCMPGT.patch |  54 ----
 llvm_patches/3_8_skx_patch_pack.patch        | 248 +++++++++++++++++
 4 files changed, 252 insertions(+), 320 deletions(-)
 delete mode 100644 llvm_patches/3_8_r260878_mask_store_size.patch
 delete mode 100644 llvm_patches/3_8_r261619-avx512-PCMPGT.patch
 create mode 100644 llvm_patches/3_8_skx_patch_pack.patch

diff --git a/alloy.py b/alloy.py
index 34e3b439..306310fd 100755
--- a/alloy.py
+++ b/alloy.py
@@ -241,13 +241,14 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
     os.makedirs(LLVM_BUILD)
     os.makedirs(LLVM_BIN)
     selfbuild_compiler = ""
+    LLVM_configure_capable = ["3_2", "3_3", "3_4", "3_5", "3_6", "3_7"]
     if selfbuild:
         print_debug("Making selfbuild and use folders " + LLVM_BUILD_selfbuild + " and " +
             LLVM_BIN_selfbuild + "\n", from_validation, alloy_build)
         os.makedirs(LLVM_BUILD_selfbuild)
         os.makedirs(LLVM_BIN_selfbuild)
         os.chdir(LLVM_BUILD_selfbuild)
-        if version_LLVM == "trunk":
+        if version_LLVM not in LLVM_configure_capable:
             # TODO: mac_root
             try_do_LLVM("configure release version for selfbuild ", "cmake -G Unix\ Makefiles" +
                         " -DCMAKE_EXPORT_COMPILE_COMMANDS=ON" +
@@ -283,7 +284,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
     os.chdir(LLVM_BUILD)
     if debug == False:
         if current_OS != "Windows":
-            if version_LLVM == "trunk":
+            if version_LLVM not in LLVM_configure_capable:
                 # TODO: mac_root
                 try_do_LLVM("configure release version ", "cmake -G Unix\ Makefiles" +
                             " -DCMAKE_EXPORT_COMPILE_COMMANDS=ON" +
@@ -311,7 +312,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra,
                 '" -DLLVM_LIT_TOOLS_DIR="C:\\gnuwin32\\bin" ..\\' + LLVM_SRC,
                 from_validation)
         else:
-            if version_LLVM == "trunk":
+            if version_LLVM not in LLVM_configure_capable:
                 # TODO: mac_root
                 try_do_LLVM("configure debug version ", "cmake -G Unix\ Makefiles" +
                             " -DCMAKE_EXPORT_COMPILE_COMMANDS=ON" +
diff --git a/llvm_patches/3_8_r260878_mask_store_size.patch b/llvm_patches/3_8_r260878_mask_store_size.patch
deleted file mode 100644
index 4477d532..00000000
--- a/llvm_patches/3_8_r260878_mask_store_size.patch
+++ /dev/null
@@ -1,263 +0,0 @@
-# This patch merges slightly modified revisions for SKX capability with LLVM 3.8
-# revisions: r260878, r258867
-Index: test/CodeGen/X86/avx512-intel-ocl.ll
-===================================================================
---- test/CodeGen/X86/avx512-intel-ocl.ll (revision 260877)
-+++ test/CodeGen/X86/avx512-intel-ocl.ll (revision 260878)
-@@ -68,10 +68,10 @@
- ; WIN64: vmovups {{.*(%rbp).*}}, %zmm21 # 64-byte Reload
- 
- ; X64-LABEL: test_prolog_epilog
--; X64: kmovw %k7, {{.*}}(%rsp) ## 8-byte Folded Spill
--; X64: kmovw %k6, {{.*}}(%rsp) ## 8-byte Folded Spill
--; X64: kmovw %k5, {{.*}}(%rsp) ## 8-byte Folded Spill
--; X64: kmovw %k4, {{.*}}(%rsp) ## 8-byte Folded Spill
-+; X64: kmovq %k7, {{.*}}(%rsp) ## 8-byte Folded Spill
-+; X64: kmovq %k6, {{.*}}(%rsp) ## 8-byte Folded Spill
-+; X64: kmovq %k5, {{.*}}(%rsp) ## 8-byte Folded Spill
-+; X64: kmovq %k4, {{.*}}(%rsp) ## 8-byte Folded Spill
- ; X64: vmovups %zmm31, {{.*}}(%rsp) ## 64-byte Spill
- ; X64: vmovups %zmm16, {{.*}}(%rsp) ## 64-byte Spill
- ; X64: call
-Index: test/CodeGen/X86/avx512-mask-spills.ll
-===================================================================
---- test/CodeGen/X86/avx512-mask-spills.ll (revision 0)
-+++ test/CodeGen/X86/avx512-mask-spills.ll (revision 260878)
-@@ -0,0 +1,126 @@
-+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
-+
-+declare void @f()
-+define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
-+; CHECK-LABEL: test_4i1:
-+; CHECK: ## BB#0:
-+; CHECK-NEXT: pushq %rax
-+; CHECK-NEXT: Ltmp0:
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
-+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
-+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
-+; CHECK-NEXT: callq _f
-+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
-+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Folded Reload
-+; CHECK-NEXT: korw %k1, %k0, %k0
-+; CHECK-NEXT: vpmovm2d %k0, %xmm0
-+; CHECK-NEXT: popq %rax
-+; CHECK-NEXT: retq
-+
-+ %cmp_res = icmp ugt <4 x i32> %a, %b
-+ %cmp_res2 = icmp sgt <4 x i32> %a, %b
-+ call void @f()
-+ %res = or <4 x i1> %cmp_res, %cmp_res2
-+ ret <4 x i1> %res
-+}
-+
-+define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
-+; CHECK-LABEL: test_8i1:
-+; CHECK: ## BB#0:
-+; CHECK-NEXT: pushq %rax
-+; CHECK-NEXT: Ltmp1:
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
-+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
-+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
-+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
-+; CHECK-NEXT: callq _f
-+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
-+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Folded Reload
-+; CHECK-NEXT: korb %k1, %k0, %k0
-+; CHECK-NEXT: vpmovm2w %k0, %xmm0
-+; CHECK-NEXT: popq %rax
-+; CHECK-NEXT: retq
-+
-+ %cmp_res = icmp ugt <8 x i32> %a, %b
-+ %cmp_res2 = icmp sgt <8 x i32> %a, %b
-+ call void @f()
-+ %res = or <8 x i1> %cmp_res, %cmp_res2
-+ ret <8 x i1> %res
-+}
-+
-+define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
-+; CHECK-LABEL: test_16i1:
-+; CHECK: ## BB#0:
-+; CHECK-NEXT: pushq %rax
-+; CHECK-NEXT: Ltmp2:
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
-+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
-+; CHECK-NEXT: callq _f
-+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
-+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Folded Reload
-+; CHECK-NEXT: korw %k1, %k0, %k0
-+; CHECK-NEXT: vpmovm2b %k0, %xmm0
-+; CHECK-NEXT: popq %rax
-+; CHECK-NEXT: retq
-+ %cmp_res = icmp ugt <16 x i32> %a, %b
-+ %cmp_res2 = icmp sgt <16 x i32> %a, %b
-+ call void @f()
-+ %res = or <16 x i1> %cmp_res, %cmp_res2
-+ ret <16 x i1> %res
-+}
-+
-+define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
-+; CHECK-LABEL: test_32i1:
-+; CHECK: ## BB#0:
-+; CHECK-NEXT: pushq %rax
-+; CHECK-NEXT: Ltmp3:
-+; CHECK-NEXT: .cfi_def_cfa_offset 16
-+; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
-+; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Folded Spill
-+; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
-+; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Folded Spill
-+; CHECK-NEXT: callq _f
-+; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Folded Reload
-+; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Folded Reload
-+; CHECK-NEXT: kord %k1, %k0, %k0
-+; CHECK-NEXT: vpmovm2b %k0, %ymm0
-+; CHECK-NEXT: popq %rax
-+; CHECK-NEXT: retq
-+ %cmp_res = icmp ugt <32 x i16> %a, %b
-+ %cmp_res2 = icmp sgt <32 x i16> %a, %b
-+ call void @f()
-+ %res = or <32 x i1> %cmp_res, %cmp_res2
-+ ret <32 x i1> %res
-+}
-+
-+define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
-+; CHECK-LABEL: test_64i1:
-+; CHECK: ## BB#0:
-+; CHECK-NEXT: subq $24, %rsp
-+; CHECK-NEXT: Ltmp4:
-+; CHECK-NEXT: .cfi_def_cfa_offset 32
-+; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
-+; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Folded Spill
-+; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
-+; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Folded Spill
-+; CHECK-NEXT: callq _f
-+; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Folded Reload
-+; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Folded Reload
-+; CHECK-NEXT: korq %k1, %k0, %k0
-+; CHECK-NEXT: vpmovm2b %k0, %zmm0
-+; CHECK-NEXT: addq $24, %rsp
-+; CHECK-NEXT: retq
-+
-+ %cmp_res = icmp ugt <64 x i8> %a, %b
-+ %cmp_res2 = icmp sgt <64 x i8> %a, %b
-+ call void @f()
-+ %res = or <64 x i1> %cmp_res, %cmp_res2
-+ ret <64 x i1> %res
-+}
-Index: lib/Target/X86/X86RegisterInfo.td
-===================================================================
---- lib/Target/X86/X86RegisterInfo.td (revision 260877)
-+++ lib/Target/X86/X86RegisterInfo.td (revision 260878)
-@@ -477,18 +477,18 @@
- 256, (sequence "YMM%u", 0, 31)>;
- 
- // Mask registers
--def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;}
--def VK2 : RegisterClass<"X86", [v2i1], 8, (add VK1)> {let Size = 8;}
--def VK4 : RegisterClass<"X86", [v4i1], 8, (add VK2)> {let Size = 8;}
--def VK8 : RegisterClass<"X86", [v8i1], 8, (add VK4)> {let Size = 8;}
-+def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
-+def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;}
-+def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;}
-+def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;}
- def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
- def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
- def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
- 
--def VK1WM : RegisterClass<"X86", [i1], 8, (sub VK1, K0)> {let Size = 8;}
--def VK2WM : RegisterClass<"X86", [v2i1], 8, (sub VK2, K0)> {let Size = 8;}
--def VK4WM : RegisterClass<"X86", [v4i1], 8, (sub VK4, K0)> {let Size = 8;}
--def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)> {let Size = 8;}
-+def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;}
-+def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;}
-+def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;}
-+def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;}
- def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;}
- def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
- def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
-Index: lib/Target/X86/X86InstrInfo.cpp
-===================================================================
---- lib/Target/X86/X86InstrInfo.cpp (revision 260877)
-+++ lib/Target/X86/X86InstrInfo.cpp (revision 260878)
-@@ -4302,12 +4302,14 @@
- return 0;
- }
- 
-+static bool isMaskRegClass(const TargetRegisterClass *RC) {
-+ // All KMASK RegClasses hold the same k registers, can be tested against anyone.
-+ return X86::VK16RegClass.hasSubClassEq(RC);
-+}
-+
- static bool MaskRegClassContains(unsigned Reg) {
-- return X86::VK8RegClass.contains(Reg) ||
-- X86::VK16RegClass.contains(Reg) ||
-- X86::VK32RegClass.contains(Reg) ||
-- X86::VK64RegClass.contains(Reg) ||
-- X86::VK1RegClass.contains(Reg);
-+ // All KMASK RegClasses hold the same k registers, can be tested against anyone.
-+ return X86::VK16RegClass.contains(Reg);
- }
- 
- static bool GRRegClassContains(unsigned Reg) {
-@@ -4509,15 +4511,28 @@
- llvm_unreachable("Cannot emit physreg copy instruction");
- }
- 
-+static unsigned getLoadStoreMaskRegOpcode(const TargetRegisterClass *RC,
-+ bool load) {
-+ switch (RC->getSize()) {
-+ default:
-+ llvm_unreachable("Unknown spill size");
-+ case 2:
-+ return load ? X86::KMOVWkm : X86::KMOVWmk;
-+ case 4:
-+ return load ? X86::KMOVDkm : X86::KMOVDmk;
-+ case 8:
-+ return load ? X86::KMOVQkm : X86::KMOVQmk;
-+ }
-+}
-+
- static unsigned getLoadStoreRegOpcode(unsigned Reg,
- const TargetRegisterClass *RC,
- bool isStackAligned,
- const X86Subtarget &STI,
- bool load) {
- if (STI.hasAVX512()) {
-- if (X86::VK8RegClass.hasSubClassEq(RC) ||
-- X86::VK16RegClass.hasSubClassEq(RC))
-- return load ? X86::KMOVWkm : X86::KMOVWmk;
-+ if (isMaskRegClass(RC))
-+ return getLoadStoreMaskRegOpcode(RC, load);
- if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
- return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
- if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
-Index: lib/Target/X86/X86ISelLowering.cpp
-===================================================================
---- lib/Target/X86/X86ISelLowering.cpp (revision 260877)
-+++ lib/Target/X86/X86ISelLowering.cpp (revision 260878)
-@@ -12123,10 +12123,15 @@
- }
- 
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-- const TargetRegisterClass* rc = getRegClassFor(VecVT);
-- if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
-- rc = getRegClassFor(MVT::v16i1);
-- unsigned MaxSift = rc->getSize()*8 - 1;
-+ if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
-+ // Use kshiftlw/rw instruction.
-+ VecVT = MVT::v16i1;
-+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
-+ DAG.getUNDEF(VecVT),
-+ Vec,
-+ DAG.getIntPtrConstant(0, dl));
-+ }
-+ unsigned MaxSift = VecVT.getVectorNumElements() - 1;
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
- DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
diff --git a/llvm_patches/3_8_r261619-avx512-PCMPGT.patch b/llvm_patches/3_8_r261619-avx512-PCMPGT.patch
deleted file mode 100644
index d014897e..00000000
--- a/llvm_patches/3_8_r261619-avx512-PCMPGT.patch
+++ /dev/null
@@ -1,54 +0,0 @@
-diff -ruN lib/Target/X86/X86InstrInfo.td lib/Target/X86/X86InstrInfo.td
---- lib/Target/X86/X86InstrInfo.td 2016-03-04 12:36:50.461576093 +0300
-+++ lib/Target/X86/X86InstrInfo.td 2016-03-04 12:38:58.747585762 +0300
-@@ -728,6 +728,8 @@
- 
- //===----------------------------------------------------------------------===//
- // X86 Instruction Predicate Definitions.
-+def TruePredicate : Predicate<"true">;
-+
- def HasCMov : Predicate<"Subtarget->hasCMov()">;
- def NoCMov : Predicate<"!Subtarget->hasCMov()">;
- 
-diff -ruN lib/Target/X86/X86InstrSSE.td lib/Target/X86/X86InstrSSE.td
---- lib/Target/X86/X86InstrSSE.td 2016-03-04 12:36:50.472576094 +0300
-+++ lib/Target/X86/X86InstrSSE.td 2016-03-04 12:41:38.419597797 +0300
-@@ -4273,17 +4273,17 @@
- //===---------------------------------------------------------------------===//
- 
- defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
-- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
-+ SSE_INTALU_ITINS_P, 1, TruePredicate>;
- defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
-- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
-+ SSE_INTALU_ITINS_P, 1, TruePredicate>;
- defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
-- SSE_INTALU_ITINS_P, 1, NoVLX>;
-+ SSE_INTALU_ITINS_P, 1, TruePredicate>;
- defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
-- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
-+ SSE_INTALU_ITINS_P, 0, TruePredicate>;
- defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
-- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
-+ SSE_INTALU_ITINS_P, 0, TruePredicate>;
- defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
-- SSE_INTALU_ITINS_P, 0, NoVLX>;
-+ SSE_INTALU_ITINS_P, 0, TruePredicate>;
- 
- //===---------------------------------------------------------------------===//
- // SSE2 - Packed Integer Shuffle Instructions
-diff -ruN test/CodeGen/X86/avx-isa-check.ll test/CodeGen/X86/avx-isa-check.ll
---- test/CodeGen/X86/avx-isa-check.ll 2016-03-04 12:36:45.218575698 +0300
-+++ test/CodeGen/X86/avx-isa-check.ll 2016-03-04 12:44:06.705608973 +0300
-@@ -568,3 +568,11 @@
- %shift = shl <8 x i16> %a,
- ret <8 x i16> %shift
- }
-+
-+define <32 x i8> @test_cmpgtb(<32 x i8> %A) {
-+; generate the follow code
-+; vpxor %ymm1, %ymm1, %ymm1
-+; vpcmpgtb %ymm0, %ymm1, %ymm0
-+ %B = ashr <32 x i8> %A, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-+ ret <32 x i8> %B
-+}
diff --git a/llvm_patches/3_8_skx_patch_pack.patch b/llvm_patches/3_8_skx_patch_pack.patch
new file mode 100644
index 00000000..d32dcd4b
--- /dev/null
+++ b/llvm_patches/3_8_skx_patch_pack.patch
@@ -0,0 +1,248 @@
+--- lib/Target/X86/X86ISelLowering.cpp 2016-04-07 01:11:55.018960678 +0300
++++ lib/Target/X86/X86ISelLowering.cpp 2016-04-07 01:13:57.643965706 +0300
+@@ -1413,9 +1413,6 @@
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
+ if (Subtarget->hasDQI()) {
+- setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
+- setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
+-
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
+@@ -1709,6 +1706,8 @@
+ addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
+ addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
+ 
++ setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom);
++ setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
+@@ -11737,10 +11736,15 @@
+ }
+ 
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+- const TargetRegisterClass* rc = getRegClassFor(VecVT);
+- if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
+- rc = getRegClassFor(MVT::v16i1);
+- unsigned MaxSift = rc->getSize()*8 - 1;
++ if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
++ // Use kshiftlw/rw instruction.
++ VecVT = MVT::v16i1;
++ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
++ DAG.getUNDEF(VecVT),
++ Vec,
++ DAG.getIntPtrConstant(0, dl));
++ }
++ unsigned MaxSift = VecVT.getVectorNumElements() - 1;
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+--- lib/Target/X86/X86InstrAVX512.td 2016-04-07 01:11:55.020960678 +0300
++++ lib/Target/X86/X86InstrAVX512.td 2016-04-07 01:12:30.680962140 +0300
+@@ -2043,9 +2043,6 @@
+ VEX, PD, VEX_W;
+ defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
+ VEX, XD;
+-}
+-
+-let Predicates = [HasBWI] in {
+ defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
+ VEX, PS, VEX_W;
+ defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
+@@ -2085,8 +2082,27 @@
+ (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>;
+ def : Pat<(store VK2:$src, addr:$dst),
+ (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>;
++ def : Pat<(store VK1:$src, addr:$dst),
++ (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
+ }
+ let Predicates = [HasAVX512, NoDQI] in {
++ def : Pat<(store VK1:$src, addr:$dst),
++ (MOV8mr addr:$dst,
++ (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
++ sub_8bit))>;
++ def : Pat<(store VK2:$src, addr:$dst),
++ (MOV8mr addr:$dst,
++ (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK2:$src, VK16)),
++ sub_8bit))>;
++ def : Pat<(store VK4:$src, addr:$dst),
++ (MOV8mr addr:$dst,
++ (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK4:$src, VK16)),
++ sub_8bit))>;
++ def : Pat<(store VK8:$src, addr:$dst),
++ (MOV8mr addr:$dst,
++ (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
++ sub_8bit))>;
++
+ def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
+ (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
+ def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
+@@ -2166,6 +2182,17 @@
+ def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
+ (COPY_TO_REGCLASS VK1:$src, VK64)>;
+ 
++def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
++def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
++def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
++
++def truncstorei1 : PatFrag<(ops node:$val, node:$ptr),
++ (truncstore node:$val, node:$ptr), [{
++ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i1;
++}]>;
++
++def : Pat<(truncstorei1 GR8:$src, addr:$dst),
++ (MOV8mr addr:$dst, GR8:$src)>;
+ 
+ // With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
+ let Predicates = [HasAVX512, NoDQI] in {
+@@ -6540,28 +6567,6 @@
+ def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
+ def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
+ 
+-def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+-def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+-def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
+-
+-def : Pat<(store VK1:$src, addr:$dst),
+- (MOV8mr addr:$dst,
+- (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
+- sub_8bit))>, Requires<[HasAVX512, NoDQI]>;
+-
+-def : Pat<(store VK8:$src, addr:$dst),
+- (MOV8mr addr:$dst,
+- (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
+- sub_8bit))>, Requires<[HasAVX512, NoDQI]>;
+-
+-def truncstorei1 : PatFrag<(ops node:$val, node:$ptr),
+- (truncstore node:$val, node:$ptr), [{
+- return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i1;
+-}]>;
+-
+-def : Pat<(truncstorei1 GR8:$src, addr:$dst),
+- (MOV8mr addr:$dst, GR8:$src)>;
+-
+ multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
+ def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
+ !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
+--- lib/Target/X86/X86InstrInfo.cpp
++++ lib/Target/X86/X86InstrInfo.cpp
+@@ -4302,12 +4302,14 @@
+ return 0;
+ }
+ 
++static bool isMaskRegClass(const TargetRegisterClass *RC) {
++ // All KMASK RegClasses hold the same k registers, can be tested against anyone.
++ return X86::VK16RegClass.hasSubClassEq(RC);
++}
++
+ static bool MaskRegClassContains(unsigned Reg) {
+- return X86::VK8RegClass.contains(Reg) ||
+- X86::VK16RegClass.contains(Reg) ||
+- X86::VK32RegClass.contains(Reg) ||
+- X86::VK64RegClass.contains(Reg) ||
+- X86::VK1RegClass.contains(Reg);
++ // All KMASK RegClasses hold the same k registers, can be tested against anyone.
++ return X86::VK16RegClass.contains(Reg);
+ }
+ 
+ static bool GRRegClassContains(unsigned Reg) {
+@@ -4509,15 +4511,28 @@
+ llvm_unreachable("Cannot emit physreg copy instruction");
+ }
+ 
++static unsigned getLoadStoreMaskRegOpcode(const TargetRegisterClass *RC,
++ bool load) {
++ switch (RC->getSize()) {
++ default:
++ llvm_unreachable("Unknown spill size");
++ case 2:
++ return load ? X86::KMOVWkm : X86::KMOVWmk;
++ case 4:
++ return load ? X86::KMOVDkm : X86::KMOVDmk;
++ case 8:
++ return load ? X86::KMOVQkm : X86::KMOVQmk;
++ }
++}
++
+ static unsigned getLoadStoreRegOpcode(unsigned Reg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ const X86Subtarget &STI,
+ bool load) {
+ if (STI.hasAVX512()) {
+- if (X86::VK8RegClass.hasSubClassEq(RC) ||
+- X86::VK16RegClass.hasSubClassEq(RC))
+- return load ? X86::KMOVWkm : X86::KMOVWmk;
++ if (isMaskRegClass(RC))
++ return getLoadStoreMaskRegOpcode(RC, load);
+ if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
+ return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
+ if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
+--- lib/Target/X86/X86InstrInfo.td 2016-04-07 01:11:55.018960678 +0300
++++ lib/Target/X86/X86InstrInfo.td 2016-04-07 01:14:17.400966516 +0300
+@@ -728,6 +728,8 @@
+ 
+ //===----------------------------------------------------------------------===//
+ // X86 Instruction Predicate Definitions.
++def TruePredicate : Predicate<"true">;
++
+ def HasCMov : Predicate<"Subtarget->hasCMov()">;
+ def NoCMov : Predicate<"!Subtarget->hasCMov()">;
+ 
+--- lib/Target/X86/X86InstrSSE.td 2016-04-07 01:11:55.014960678 +0300
++++ lib/Target/X86/X86InstrSSE.td 2016-04-07 01:14:18.172966548 +0300
+@@ -4273,17 +4273,17 @@
+ //===---------------------------------------------------------------------===//
+ 
+ defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
+- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
++ SSE_INTALU_ITINS_P, 1, TruePredicate>;
+ defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
+- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
++ SSE_INTALU_ITINS_P, 1, TruePredicate>;
+ defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
+- SSE_INTALU_ITINS_P, 1, NoVLX>;
++ SSE_INTALU_ITINS_P, 1, TruePredicate>;
+ defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
+- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
++ SSE_INTALU_ITINS_P, 0, TruePredicate>;
+ defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
+- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
++ SSE_INTALU_ITINS_P, 0, TruePredicate>;
+ defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
+- SSE_INTALU_ITINS_P, 0, NoVLX>;
++ SSE_INTALU_ITINS_P, 0, TruePredicate>;
+ 
+ //===---------------------------------------------------------------------===//
+ // SSE2 - Packed Integer Shuffle Instructions
+--- lib/Target/X86/X86RegisterInfo.td 2016-04-07 01:11:55.018960678 +0300
++++ lib/Target/X86/X86RegisterInfo.td 2016-04-07 01:13:01.037963385 +0300
+@@ -477,18 +477,18 @@
+ 256, (sequence "YMM%u", 0, 31)>;
+ 
+ // Mask registers
+-def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;}
+-def VK2 : RegisterClass<"X86", [v2i1], 8, (add VK1)> {let Size = 8;}
+-def VK4 : RegisterClass<"X86", [v4i1], 8, (add VK2)> {let Size = 8;}
+-def VK8 : RegisterClass<"X86", [v8i1], 8, (add VK4)> {let Size = 8;}
++def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
++def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;}
++def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;}
++def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;}
+ def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
+ def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
+ def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
+ 
+-def VK1WM : RegisterClass<"X86", [i1], 8, (sub VK1, K0)> {let Size = 8;}
+-def VK2WM : RegisterClass<"X86", [v2i1], 8, (sub VK2, K0)> {let Size = 8;}
+-def VK4WM : RegisterClass<"X86", [v4i1], 8, (sub VK4, K0)> {let Size = 8;}
+-def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)> {let Size = 8;}
++def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;}
++def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;}
++def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;}
++def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;}
+ def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;}
+ def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
+ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
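
For reference, the alloy.py hunks above boil down to a version gate: releases 3.2-3.7 keep the classic autoconf `configure` flow, while 3.8+, trunk, and any unrecognized version fall through to CMake (autoconf was deprecated upstream around 3.8). The sketch below is a minimal, self-contained illustration of that gate; `make_configure_command` is a hypothetical helper invented for this note (the real script drives the commands through try_do_LLVM() with many more flags than shown).

# Minimal sketch of the version gate used by build_LLVM() in alloy.py.
# `make_configure_command` is a hypothetical helper for illustration only;
# the CMake/configure flag sets are abbreviated from the patch.

# Releases that still ship a usable autoconf build; everything else uses CMake.
LLVM_CONFIGURE_CAPABLE = ["3_2", "3_3", "3_4", "3_5", "3_6", "3_7"]

def make_configure_command(version_llvm, llvm_src, install_prefix):
    """Return the configuration command line for one LLVM version string."""
    if version_llvm not in LLVM_CONFIGURE_CAPABLE:
        # trunk, 3.8, and anything newer: configure the build with CMake.
        return ("cmake -G 'Unix Makefiles'"
                " -DCMAKE_EXPORT_COMPILE_COMMANDS=ON"
                " -DCMAKE_BUILD_TYPE=Release"
                " -DCMAKE_INSTALL_PREFIX=" + install_prefix +
                " " + llvm_src)
    # 3.2-3.7: keep the classic autoconf flow.
    return llvm_src + "/configure --enable-optimized --prefix=" + install_prefix

if __name__ == "__main__":
    for version in ("3_7", "3_8", "trunk"):
        print(version, "->", make_configure_command(version, "../llvm", "/opt/llvm"))

Note the design choice: testing membership in an allowlist instead of the old `version_LLVM == "trunk"` check means every future numbered release defaults to the CMake path automatically, with no further edits to the condition.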