--- lib/Target/X86/X86ISelLowering.cpp	2016-04-07 01:11:55.018960678 +0300
+++ lib/Target/X86/X86ISelLowering.cpp	2016-04-07 01:13:57.643965706 +0300
@@ -1413,9 +1413,6 @@
     setOperationAction(ISD::VECTOR_SHUFFLE,   MVT::v8i1,  Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE,   MVT::v16i1, Custom);
     if (Subtarget->hasDQI()) {
-      setOperationAction(ISD::TRUNCATE,       MVT::v2i1,  Custom);
-      setOperationAction(ISD::TRUNCATE,       MVT::v4i1,  Custom);
-
       setOperationAction(ISD::SINT_TO_FP,     MVT::v8i64, Legal);
       setOperationAction(ISD::UINT_TO_FP,     MVT::v8i64, Legal);
       setOperationAction(ISD::FP_TO_SINT,     MVT::v8i64, Legal);
@@ -1709,6 +1706,8 @@
     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
 
+    setOperationAction(ISD::TRUNCATE,         MVT::v2i1,  Custom);
+    setOperationAction(ISD::TRUNCATE,         MVT::v4i1,  Custom);
     setOperationAction(ISD::SETCC,            MVT::v4i1,  Custom);
     setOperationAction(ISD::SETCC,            MVT::v2i1,  Custom);
     setOperationAction(ISD::CONCAT_VECTORS,   MVT::v4i1,  Custom);
@@ -11737,10 +11736,15 @@
   }
 
   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-  const TargetRegisterClass* rc = getRegClassFor(VecVT);
-  if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
-    rc = getRegClassFor(MVT::v16i1);
-  unsigned MaxSift = rc->getSize()*8 - 1;
+  if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
+    // Use kshiftlw/rw instruction.
+    VecVT = MVT::v16i1;
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
+                      DAG.getUNDEF(VecVT),
+                      Vec,
+                      DAG.getIntPtrConstant(0, dl));
+  }
+  unsigned MaxSift = VecVT.getVectorNumElements() - 1;
   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
                     DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
--- lib/Target/X86/X86InstrAVX512.td	2016-04-07 01:11:55.020960678 +0300
+++ lib/Target/X86/X86InstrAVX512.td	2016-04-07 01:12:30.680962140 +0300
@@ -2043,9 +2043,6 @@
                VEX, PD, VEX_W;
   defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
                VEX, XD;
-}
-
-let Predicates = [HasBWI] in {
   defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>,
                VEX, PS, VEX_W;
   defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
@@ -2085,8 +2082,27 @@
             (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>;
   def : Pat<(store VK2:$src, addr:$dst),
             (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>;
+  def : Pat<(store VK1:$src, addr:$dst),
+            (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
 }
 let Predicates = [HasAVX512, NoDQI] in {
+  def : Pat<(store VK1:$src, addr:$dst),
+            (MOV8mr addr:$dst,
+             (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
+              sub_8bit))>;
+  def : Pat<(store VK2:$src, addr:$dst),
+            (MOV8mr addr:$dst,
+             (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK2:$src, VK16)),
+              sub_8bit))>;
+  def : Pat<(store VK4:$src, addr:$dst),
+            (MOV8mr addr:$dst,
+             (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK4:$src, VK16)),
+              sub_8bit))>;
+  def : Pat<(store VK8:$src, addr:$dst),
+            (MOV8mr addr:$dst,
+             (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
+              sub_8bit))>;
+
   def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
             (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
   def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
@@ -2166,6 +2182,17 @@
 def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
           (COPY_TO_REGCLASS VK1:$src, VK64)>;
 
+def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
+
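+// truncstorei1 matches a truncating store whose memory type is i1,
+// so the pattern below can lower it to a plain byte store.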
+def truncstorei1 : PatFrag<(ops node:$val, node:$ptr),
+                           (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i1;
+}]>;
+
+def : Pat<(truncstorei1 GR8:$src, addr:$dst),
+          (MOV8mr addr:$dst, GR8:$src)>;
 
 // With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
 let Predicates = [HasAVX512, NoDQI] in {
@@ -6540,28 +6567,6 @@
 def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
 def v8i1sextv8i64   : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
 
-def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
-def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
-def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
-
-def : Pat<(store VK1:$src, addr:$dst),
-          (MOV8mr addr:$dst,
-           (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
-            sub_8bit))>, Requires<[HasAVX512, NoDQI]>;
-
-def : Pat<(store VK8:$src, addr:$dst),
-          (MOV8mr addr:$dst,
-           (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
-            sub_8bit))>, Requires<[HasAVX512, NoDQI]>;
-
-def truncstorei1 : PatFrag<(ops node:$val, node:$ptr),
-                           (truncstore node:$val, node:$ptr), [{
-  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i1;
-}]>;
-
-def : Pat<(truncstorei1 GR8:$src, addr:$dst),
-          (MOV8mr addr:$dst, GR8:$src)>;
-
 multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
 def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
                   !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ ... @@
+static bool isMaskRegClass(const TargetRegisterClass *RC) {
+  return X86::VK16RegClass.hasSubClassEq(RC) ||
+         X86::VK32RegClass.hasSubClassEq(RC) ||
+         X86::VK64RegClass.hasSubClassEq(RC);
+}
+
+static unsigned getLoadStoreMaskRegOpcode(const TargetRegisterClass *RC,
+                                          bool load) {
+  switch (RC->getSize()) {
+  default:
+    llvm_unreachable("Unknown spill size");
+  case 2:
+    return load ? X86::KMOVWkm : X86::KMOVWmk;
+  case 4:
+    return load ? X86::KMOVDkm : X86::KMOVDmk;
+  case 8:
+    return load ? X86::KMOVQkm : X86::KMOVQmk;
+  }
+}
+
 static unsigned getLoadStoreRegOpcode(unsigned Reg,
                                       const TargetRegisterClass *RC,
                                       bool isStackAligned,
                                       const X86Subtarget &STI,
                                       bool load) {
   if (STI.hasAVX512()) {
-    if (X86::VK8RegClass.hasSubClassEq(RC) ||
-        X86::VK16RegClass.hasSubClassEq(RC))
-      return load ? X86::KMOVWkm : X86::KMOVWmk;
+    if (isMaskRegClass(RC))
+      return getLoadStoreMaskRegOpcode(RC, load);
     if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
       return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
     if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
--- lib/Target/X86/X86InstrInfo.td	2016-04-07 01:11:55.018960678 +0300
+++ lib/Target/X86/X86InstrInfo.td	2016-04-07 01:14:17.400966516 +0300
@@ -728,6 +728,8 @@
 
 //===----------------------------------------------------------------------===//
 // X86 Instruction Predicate Definitions.
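+// An always-true predicate, for multiclass parameters that expect a
+// predicate operand when no extra feature guard is wanted.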
+def TruePredicate : Predicate<"true">;
+
 def HasCMov      : Predicate<"Subtarget->hasCMov()">;
 def NoCMov       : Predicate<"!Subtarget->hasCMov()">;
--- lib/Target/X86/X86InstrSSE.td	2016-04-07 01:11:55.014960678 +0300
+++ lib/Target/X86/X86InstrSSE.td	2016-04-07 01:14:18.172966548 +0300
@@ -4273,17 +4273,17 @@
 
 //===---------------------------------------------------------------------===//
 defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
-                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+                             SSE_INTALU_ITINS_P, 1, TruePredicate>;
 defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
-                             SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
+                             SSE_INTALU_ITINS_P, 1, TruePredicate>;
 defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
-                             SSE_INTALU_ITINS_P, 1, NoVLX>;
+                             SSE_INTALU_ITINS_P, 1, TruePredicate>;
 defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
-                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+                             SSE_INTALU_ITINS_P, 0, TruePredicate>;
 defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
-                             SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
+                             SSE_INTALU_ITINS_P, 0, TruePredicate>;
 defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
-                             SSE_INTALU_ITINS_P, 0, NoVLX>;
+                             SSE_INTALU_ITINS_P, 0, TruePredicate>;
 
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Shuffle Instructions
--- lib/Target/X86/X86RegisterInfo.td	2016-04-07 01:11:55.018960678 +0300
+++ lib/Target/X86/X86RegisterInfo.td	2016-04-07 01:13:01.037963385 +0300
@@ -477,18 +477,18 @@
                           256, (sequence "YMM%u", 0, 31)>;
 
 // Mask registers
-def VK1     : RegisterClass<"X86", [i1],    8,  (sequence "K%u", 0, 7)> {let Size = 8;}
-def VK2     : RegisterClass<"X86", [v2i1],  8,  (add VK1)> {let Size = 8;}
-def VK4     : RegisterClass<"X86", [v4i1],  8,  (add VK2)> {let Size = 8;}
-def VK8     : RegisterClass<"X86", [v8i1],  8,  (add VK4)> {let Size = 8;}
+def VK1     : RegisterClass<"X86", [i1],    16, (sequence "K%u", 0, 7)> {let Size = 16;}
+def VK2     : RegisterClass<"X86", [v2i1],  16, (add VK1)> {let Size = 16;}
+def VK4     : RegisterClass<"X86", [v4i1],  16, (add VK2)> {let Size = 16;}
+def VK8     : RegisterClass<"X86", [v8i1],  16, (add VK4)> {let Size = 16;}
 def VK16    : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
 def VK32    : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
 def VK64    : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
 
-def VK1WM   : RegisterClass<"X86", [i1],    8,  (sub VK1, K0)> {let Size = 8;}
-def VK2WM   : RegisterClass<"X86", [v2i1],  8,  (sub VK2, K0)> {let Size = 8;}
-def VK4WM   : RegisterClass<"X86", [v4i1],  8,  (sub VK4, K0)> {let Size = 8;}
-def VK8WM   : RegisterClass<"X86", [v8i1],  8,  (sub VK8, K0)> {let Size = 8;}
+def VK1WM   : RegisterClass<"X86", [i1],    16, (sub VK1, K0)> {let Size = 16;}
+def VK2WM   : RegisterClass<"X86", [v2i1],  16, (sub VK2, K0)> {let Size = 16;}
+def VK4WM   : RegisterClass<"X86", [v4i1],  16, (sub VK4, K0)> {let Size = 16;}
+def VK8WM   : RegisterClass<"X86", [v8i1],  16, (sub VK8, K0)> {let Size = 16;}
 def VK16WM  : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;}
 def VK32WM  : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
 def VK64WM  : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
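
Note for reviewers: the EXTRACT_VECTOR_ELT hunk above widens sub-byte mask
vectors to v16i1 (without DQI there is no byte-granularity kshift) and then
isolates the requested bit with a kshiftlw/kshiftrw pair. A minimal
standalone C++ sketch of that bit arithmetic follows; the helper name
extractMaskBit is illustrative only and is not part of the patch.

  #include <cassert>
  #include <cstdint>

  // Model a 16-bit k-register. Bit IdxVal is moved to the MSB (kshiftlw by
  // MaxSift - IdxVal) and then down to bit 0 (kshiftrw by MaxSift), the
  // same shift amounts the lowering computes after widening to v16i1.
  static unsigned extractMaskBit(uint16_t Mask, unsigned IdxVal) {
    const unsigned MaxSift = 16 - 1; // VecVT.getVectorNumElements() - 1
    uint16_t Tmp = static_cast<uint16_t>(Mask << (MaxSift - IdxVal));
    return Tmp >> MaxSift; // 0 or 1
  }

  int main() {
    assert(extractMaskBit(0x0020, 5) == 1); // bit 5 is set
    assert(extractMaskBit(0x0020, 4) == 0); // bit 4 is clear
    return 0;
  }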