diff --git a/llvm_patches/3_8_r260878_mask_store_size.patch b/llvm_patches/3_8_r260878_mask_store_size.patch
new file mode 100644
index 00000000..4477d532
--- /dev/null
+++ b/llvm_patches/3_8_r260878_mask_store_size.patch
@@ -0,0 +1,263 @@
+# This patch merges slightly modified revisions for SKX capability with LLVM 3.8
+# revisions: r260878, r258867
+Index: test/CodeGen/X86/avx512-intel-ocl.ll
+===================================================================
+--- test/CodeGen/X86/avx512-intel-ocl.ll (revision 260877)
++++ test/CodeGen/X86/avx512-intel-ocl.ll (revision 260878)
+@@ -68,10 +68,10 @@
+ ; WIN64: vmovups {{.*(%rbp).*}}, %zmm21 # 64-byte Reload
+
+ ; X64-LABEL: test_prolog_epilog
+-; X64: kmovw %k7, {{.*}}(%rsp) ## 8-byte Folded Spill
+-; X64: kmovw %k6, {{.*}}(%rsp) ## 8-byte Folded Spill
+-; X64: kmovw %k5, {{.*}}(%rsp) ## 8-byte Folded Spill
+-; X64: kmovw %k4, {{.*}}(%rsp) ## 8-byte Folded Spill
++; X64: kmovq %k7, {{.*}}(%rsp) ## 8-byte Folded Spill
++; X64: kmovq %k6, {{.*}}(%rsp) ## 8-byte Folded Spill
++; X64: kmovq %k5, {{.*}}(%rsp) ## 8-byte Folded Spill
++; X64: kmovq %k4, {{.*}}(%rsp) ## 8-byte Folded Spill
+ ; X64: vmovups %zmm31, {{.*}}(%rsp) ## 64-byte Spill
+ ; X64: vmovups %zmm16, {{.*}}(%rsp) ## 64-byte Spill
+ ; X64: call
+Index: test/CodeGen/X86/avx512-mask-spills.ll
+===================================================================
+--- test/CodeGen/X86/avx512-mask-spills.ll (revision 0)
++++ test/CodeGen/X86/avx512-mask-spills.ll (revision 260878)
+@@ -0,0 +1,126 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
++; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
++
++declare void @f()
++define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
++; CHECK-LABEL: test_4i1:
++; CHECK: ## BB#0:
++; CHECK-NEXT: pushq %rax
++; CHECK-NEXT: Ltmp0:
++; CHECK-NEXT: .cfi_def_cfa_offset 16
++; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
++; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
++; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
++; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
++; CHECK-NEXT: callq _f
++; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
++; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Folded Reload
++; CHECK-NEXT: korw %k1, %k0, %k0
++; CHECK-NEXT: vpmovm2d %k0, %xmm0
++; CHECK-NEXT: popq %rax
++; CHECK-NEXT: retq
++
++ %cmp_res = icmp ugt <4 x i32> %a, %b
++ %cmp_res2 = icmp sgt <4 x i32> %a, %b
++ call void @f()
++ %res = or <4 x i1> %cmp_res, %cmp_res2
++ ret <4 x i1> %res
++}
++
++define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
++; CHECK-LABEL: test_8i1:
++; CHECK: ## BB#0:
++; CHECK-NEXT: pushq %rax
++; CHECK-NEXT: Ltmp1:
++; CHECK-NEXT: .cfi_def_cfa_offset 16
++; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
++; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
++; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
++; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
++; CHECK-NEXT: callq _f
++; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
++; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Folded Reload
++; CHECK-NEXT: korb %k1, %k0, %k0
++; CHECK-NEXT: vpmovm2w %k0, %xmm0
++; CHECK-NEXT: popq %rax
++; CHECK-NEXT: retq
++
++ %cmp_res = icmp ugt <8 x i32> %a, %b
++ %cmp_res2 = icmp sgt <8 x i32> %a, %b
++ call void @f()
++ %res = or <8 x i1> %cmp_res, %cmp_res2
++ ret <8 x i1> %res
++}
++
++define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
++; CHECK-LABEL: test_16i1:
++; CHECK: ## BB#0:
++; CHECK-NEXT: pushq %rax
++; CHECK-NEXT: Ltmp2:
++; CHECK-NEXT: .cfi_def_cfa_offset 16
++; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
++; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
++; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
++; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Folded Spill
++; CHECK-NEXT: callq _f
++; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Folded Reload
++; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Folded Reload
++; CHECK-NEXT: korw %k1, %k0, %k0
++; CHECK-NEXT: vpmovm2b %k0, %xmm0
++; CHECK-NEXT: popq %rax
++; CHECK-NEXT: retq
++ %cmp_res = icmp ugt <16 x i32> %a, %b
++ %cmp_res2 = icmp sgt <16 x i32> %a, %b
++ call void @f()
++ %res = or <16 x i1> %cmp_res, %cmp_res2
++ ret <16 x i1> %res
++}
++
++define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
++; CHECK-LABEL: test_32i1:
++; CHECK: ## BB#0:
++; CHECK-NEXT: pushq %rax
++; CHECK-NEXT: Ltmp3:
++; CHECK-NEXT: .cfi_def_cfa_offset 16
++; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
++; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Folded Spill
++; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
++; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Folded Spill
++; CHECK-NEXT: callq _f
++; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Folded Reload
++; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Folded Reload
++; CHECK-NEXT: kord %k1, %k0, %k0
++; CHECK-NEXT: vpmovm2b %k0, %ymm0
++; CHECK-NEXT: popq %rax
++; CHECK-NEXT: retq
++ %cmp_res = icmp ugt <32 x i16> %a, %b
++ %cmp_res2 = icmp sgt <32 x i16> %a, %b
++ call void @f()
++ %res = or <32 x i1> %cmp_res, %cmp_res2
++ ret <32 x i1> %res
++}
++
++define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
++; CHECK-LABEL: test_64i1:
++; CHECK: ## BB#0:
++; CHECK-NEXT: subq $24, %rsp
++; CHECK-NEXT: Ltmp4:
++; CHECK-NEXT: .cfi_def_cfa_offset 32
++; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
++; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Folded Spill
++; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
++; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Folded Spill
++; CHECK-NEXT: callq _f
++; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Folded Reload
++; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Folded Reload
++; CHECK-NEXT: korq %k1, %k0, %k0
++; CHECK-NEXT: vpmovm2b %k0, %zmm0
++; CHECK-NEXT: addq $24, %rsp
++; CHECK-NEXT: retq
++
++ %cmp_res = icmp ugt <64 x i8> %a, %b
++ %cmp_res2 = icmp sgt <64 x i8> %a, %b
++ call void @f()
++ %res = or <64 x i1> %cmp_res, %cmp_res2
++ ret <64 x i1> %res
++}
+Index: lib/Target/X86/X86RegisterInfo.td
+===================================================================
+--- lib/Target/X86/X86RegisterInfo.td (revision 260877)
++++ lib/Target/X86/X86RegisterInfo.td (revision 260878)
+@@ -477,18 +477,18 @@
+ 256, (sequence "YMM%u", 0, 31)>;
+
+ // Mask registers
+-def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;}
+-def VK2 : RegisterClass<"X86", [v2i1], 8, (add VK1)> {let Size = 8;}
+-def VK4 : RegisterClass<"X86", [v4i1], 8, (add VK2)> {let Size = 8;}
+-def VK8 : RegisterClass<"X86", [v8i1], 8, (add VK4)> {let Size = 8;}
++def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
++def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;}
++def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;}
++def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;}
+ def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
+ def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
+ def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
+
+-def VK1WM : RegisterClass<"X86", [i1], 8, (sub VK1, K0)> {let Size = 8;}
+-def VK2WM : RegisterClass<"X86", [v2i1], 8, (sub VK2, K0)> {let Size = 8;}
+-def VK4WM : RegisterClass<"X86", [v4i1], 8, (sub VK4, K0)> {let Size = 8;}
+-def VK8WM : RegisterClass<"X86", [v8i1], 8, (sub VK8, K0)> {let Size = 8;}
++def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;}
++def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;}
++def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;}
++def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;}
+ def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;}
+ def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
+ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
+Index: lib/Target/X86/X86InstrInfo.cpp
+===================================================================
+--- lib/Target/X86/X86InstrInfo.cpp (revision 260877)
++++ lib/Target/X86/X86InstrInfo.cpp (revision 260878)
+@@ -4302,12 +4302,14 @@
+ return 0;
+ }
+
++static bool isMaskRegClass(const TargetRegisterClass *RC) {
++ // All KMASK RegClasses hold the same k registers, can be tested against anyone.
++ return X86::VK16RegClass.hasSubClassEq(RC);
++}
++
+ static bool MaskRegClassContains(unsigned Reg) {
+- return X86::VK8RegClass.contains(Reg) ||
+- X86::VK16RegClass.contains(Reg) ||
+- X86::VK32RegClass.contains(Reg) ||
+- X86::VK64RegClass.contains(Reg) ||
+- X86::VK1RegClass.contains(Reg);
++ // All KMASK RegClasses hold the same k registers, can be tested against anyone.
++ return X86::VK16RegClass.contains(Reg);
+ }
+
+ static bool GRRegClassContains(unsigned Reg) {
+@@ -4509,15 +4511,28 @@
+ llvm_unreachable("Cannot emit physreg copy instruction");
+ }
+
++static unsigned getLoadStoreMaskRegOpcode(const TargetRegisterClass *RC,
++ bool load) {
++ switch (RC->getSize()) {
++ default:
++ llvm_unreachable("Unknown spill size");
++ case 2:
++ return load ? X86::KMOVWkm : X86::KMOVWmk;
++ case 4:
++ return load ? X86::KMOVDkm : X86::KMOVDmk;
++ case 8:
++ return load ? X86::KMOVQkm : X86::KMOVQmk;
++ }
++}
++
+ static unsigned getLoadStoreRegOpcode(unsigned Reg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ const X86Subtarget &STI,
+ bool load) {
+ if (STI.hasAVX512()) {
+- if (X86::VK8RegClass.hasSubClassEq(RC) ||
+- X86::VK16RegClass.hasSubClassEq(RC))
+- return load ? X86::KMOVWkm : X86::KMOVWmk;
++ if (isMaskRegClass(RC))
++ return getLoadStoreMaskRegOpcode(RC, load);
+ if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
+ return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
+ if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
+Index: lib/Target/X86/X86ISelLowering.cpp
+===================================================================
+--- lib/Target/X86/X86ISelLowering.cpp (revision 260877)
++++ lib/Target/X86/X86ISelLowering.cpp (revision 260878)
+@@ -12123,10 +12123,15 @@
+ }
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+- const TargetRegisterClass* rc = getRegClassFor(VecVT);
+- if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
+- rc = getRegClassFor(MVT::v16i1);
+- unsigned MaxSift = rc->getSize()*8 - 1;
++ if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
++ // Use kshiftlw/rw instruction.
++ VecVT = MVT::v16i1;
++ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
++ DAG.getUNDEF(VecVT),
++ Vec,
++ DAG.getIntPtrConstant(0, dl));
++ }
++ unsigned MaxSift = VecVT.getVectorNumElements() - 1;
+ Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
+ Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
diff --git a/llvm_patches/3_8_r261619-avx512-PCMPGT.patch b/llvm_patches/3_8_r261619-avx512-PCMPGT.patch
new file mode 100644
index 00000000..d014897e
--- /dev/null
+++ b/llvm_patches/3_8_r261619-avx512-PCMPGT.patch
@@ -0,0 +1,54 @@
+diff -ruN lib/Target/X86/X86InstrInfo.td lib/Target/X86/X86InstrInfo.td
+--- lib/Target/X86/X86InstrInfo.td 2016-03-04 12:36:50.461576093 +0300
++++ lib/Target/X86/X86InstrInfo.td 2016-03-04 12:38:58.747585762 +0300
+@@ -728,6 +728,8 @@
+
+ //===----------------------------------------------------------------------===//
+ // X86 Instruction Predicate Definitions.
++def TruePredicate : Predicate<"true">;
++
+ def HasCMov : Predicate<"Subtarget->hasCMov()">;
+ def NoCMov : Predicate<"!Subtarget->hasCMov()">;
+
+diff -ruN lib/Target/X86/X86InstrSSE.td lib/Target/X86/X86InstrSSE.td
+--- lib/Target/X86/X86InstrSSE.td 2016-03-04 12:36:50.472576094 +0300
++++ lib/Target/X86/X86InstrSSE.td 2016-03-04 12:41:38.419597797 +0300
+@@ -4273,17 +4273,17 @@
+ //===---------------------------------------------------------------------===//
+
+ defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
+- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
++ SSE_INTALU_ITINS_P, 1, TruePredicate>;
+ defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
+- SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
++ SSE_INTALU_ITINS_P, 1, TruePredicate>;
+ defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
+- SSE_INTALU_ITINS_P, 1, NoVLX>;
++ SSE_INTALU_ITINS_P, 1, TruePredicate>;
+ defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
+- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
++ SSE_INTALU_ITINS_P, 0, TruePredicate>;
+ defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
+- SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
++ SSE_INTALU_ITINS_P, 0, TruePredicate>;
+ defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
+- SSE_INTALU_ITINS_P, 0, NoVLX>;
++ SSE_INTALU_ITINS_P, 0, TruePredicate>;
+
+ //===---------------------------------------------------------------------===//
+ // SSE2 - Packed Integer Shuffle Instructions
+diff -ruN test/CodeGen/X86/avx-isa-check.ll test/CodeGen/X86/avx-isa-check.ll
+--- test/CodeGen/X86/avx-isa-check.ll 2016-03-04 12:36:45.218575698 +0300
++++ test/CodeGen/X86/avx-isa-check.ll 2016-03-04 12:44:06.705608973 +0300
+@@ -568,3 +568,11 @@
+ %shift = shl <8 x i16> %a,
+ ret <8 x i16> %shift
+ }
++
++define <32 x i8> @test_cmpgtb(<32 x i8> %A) {
++; generate the follow code
++; vpxor %ymm1, %ymm1, %ymm1
++; vpcmpgtb %ymm0, %ymm1, %ymm0
++ %B = ashr <32 x i8> %A, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
++ ret <32 x i8> %B
++}