diff --git a/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch new file mode 100644 index 00000000..8f0a790b --- /dev/null +++ b/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch @@ -0,0 +1,115 @@ +From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001 +From: Michael Liao +Date: Mon, 21 Oct 2013 17:47:58 -0700 +Subject: [PATCH] Fix PR17631 + +- Skip instructions added in prolog. For specific targets, prolog may + insert helper function calls (e.g. _chkstk will be called when + there're more than 4K bytes allocated on stack). However, these + helpers don't use/def YMM/XMM registers. + It also include second fix for the problem: r196261+r196391. + +diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp +index 477f75a..0d37a7d 100644 +--- lib/Target/X86/X86VZeroUpper.cpp ++++ lib/Target/X86/X86VZeroUpper.cpp +@@ -121,7 +121,7 @@ + } + + static bool clobbersAllYmmRegs(const MachineOperand &MO) { +- for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) { ++ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { + if (!MO.clobbersPhysReg(reg)) + return false; + } +@@ -143,6 +143,21 @@ + return false; + } + ++/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this ++/// instruction. ++static bool clobbersAnyYmmReg(MachineInstr *MI) { ++ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { ++ const MachineOperand &MO = MI->getOperand(i); ++ if (!MO.isRegMask()) ++ continue; ++ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { ++ if (MO.clobbersPhysReg(reg)) ++ return true; ++ } ++ } ++ return false; ++} ++ + /// runOnMachineFunction - Loop over all of the basic blocks, inserting + /// vzero upper instructions before function calls. + bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { +@@ -226,8 +241,9 @@ + bool BBHasCall = false; + + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { ++ DebugLoc dl = I->getDebugLoc(); + MachineInstr *MI = I; +- DebugLoc dl = I->getDebugLoc(); ++ + bool isControlFlow = MI->isCall() || MI->isReturn(); + + // Shortcut: don't need to check regular instructions in dirty state. +@@ -246,6 +262,14 @@ + if (!isControlFlow) + continue; + ++ // If the call won't clobber any YMM register, skip it as well. It usually ++ // happens on helper function calls (such as '_chkstk', '_ftol2') where ++ // standard calling convention is not used (RegMask is not used to mark ++ // register clobbered and register usage (def/imp-def/use) is well-dfined ++ // and explicitly specified. ++ if (MI->isCall() && !clobbersAnyYmmReg(MI)) ++ continue; ++ + BBHasCall = true; + + // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX +diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll +new file mode 100644 +index 0000000..a572ff2 +--- /dev/null ++++ test/CodeGen/X86/pr17631.ll +@@ -0,0 +1,34 @@ ++; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s ++ ++%struct_type = type { [64 x <8 x float>], <8 x float> } ++ ++; Function Attrs: nounwind readnone ++declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) ++ ++; Function Attrs: nounwind ++define i32 @equal(<8 x i32> %A) { ++allocas: ++ %first_alloc = alloca [64 x <8 x i32>] ++ %second_alloc = alloca %struct_type ++ ++ %A1 = bitcast <8 x i32> %A to <8 x float> ++ %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1) ++ ret i32 %A2 ++} ++ ++; CHECK: equal ++; CHECK-NOT: vzeroupper ++; CHECK: _chkstk ++; CHECK: ret ++ ++define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) { ++ %i = fptoui double %x to i64 ++ store i64 %i, i64* %p ++ %ret = fadd <8 x float> %y, %y ++ ret <8 x float> %ret ++} ++ ++; CHECK: foo ++; CHECK-NOT: vzeroupper ++; CHECK: _ftol2 ++; CHECK: ret +-- +1.8.1.2 + diff --git a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch deleted file mode 100644 index b6abb1d3..00000000 --- a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch +++ /dev/null @@ -1,69 +0,0 @@ -From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001 -From: Michael Liao -Date: Mon, 21 Oct 2013 17:47:58 -0700 -Subject: [PATCH] Fix PR17631 - -- Skip instructions added in prolog. For specific targets, prolog may - insert helper function calls (e.g. _chkstk will be called when - there're more than 4K bytes allocated on stack). However, these - helpers don't use/def YMM/XMM registers. ---- - lib/Target/X86/X86VZeroUpper.cpp | 11 ++++++++++- - test/CodeGen/X86/pr17631.ll | 22 ++++++++++++++++++++++ - 2 files changed, 32 insertions(+), 1 deletion(-) - create mode 100644 test/CodeGen/X86/pr17631.ll - -diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp -index 477f75a..0d37a7d 100644 ---- lib/Target/X86/X86VZeroUpper.cpp -+++ lib/Target/X86/X86VZeroUpper.cpp -@@ -231,8 +231,17 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, - bool BBHasCall = false; - - for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { -- MachineInstr *MI = I; - DebugLoc dl = I->getDebugLoc(); -+ MachineInstr *MI = I; -+ -+ // Don't need to check instructions added in prolog. -+ // In prolog, special function calls may be added for specific targets -+ // (e.g. on Windows, a prolog helper '_chkstk' is called when the local -+ // variables exceed 4K bytes on stack.) These helpers won't use/def YMM/XMM -+ // registers. -+ if (MI->getFlag(MachineInstr::FrameSetup)) -+ continue; -+ - bool isControlFlow = MI->isCall() || MI->isReturn(); - - // Shortcut: don't need to check regular instructions in dirty state. -diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll -new file mode 100644 -index 0000000..a572ff2 ---- /dev/null -+++ test/CodeGen/X86/pr17631.ll -@@ -0,0 +1,22 @@ -+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s -+ -+%struct_type = type { [64 x <8 x float>], <8 x float> } -+ -+; Function Attrs: nounwind readnone -+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) -+ -+; Function Attrs: nounwind -+define i32 @equal(<8 x i32> %A) { -+allocas: -+ %first_alloc = alloca [64 x <8 x i32>] -+ %second_alloc = alloca %struct_type -+ -+ %A1 = bitcast <8 x i32> %A to <8 x float> -+ %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1) -+ ret i32 %A2 -+} -+ -+; CHECK: equal -+; CHECK-NOT: vzeroupper -+; CHECK: _chkstk -+; CHECK: ret --- -1.8.1.2 -