From 7abbe97ee9e73de69dece3ed384e381e0835fecd Mon Sep 17 00:00:00 2001
From: Ilia Filippov <ili.filippov@gmail.com>
Date: Fri, 11 Oct 2013 11:39:02 +0400
Subject: [PATCH] Add LLVM patch for failures at avx-x2

---
 fail_db.txt                             | 16 -----
 llvm_patches/3_3_0001-Fix-PR16807.patch | 78 +++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 16 deletions(-)
 create mode 100755 llvm_patches/3_3_0001-Fix-PR16807.patch

diff --git a/fail_db.txt b/fail_db.txt
index 31db9961..7c543cc6 100644
--- a/fail_db.txt
+++ b/fail_db.txt
@@ -37,10 +37,6 @@
 ./tests/reduce-equal-13.ispc compfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.4 -O2 *
 ./tests/reduce-equal-5.ispc compfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.4 -O2 *
 ./tests/reduce-equal-6.ispc compfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/avg-down-int8.ispc compfail     x86    avx1-i32x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/avg-up-int8.ispc compfail     x86    avx1-i32x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64    avx1-i32x16   Linux LLVM 3.3     g++4.4 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64    avx1-i32x16   Linux LLVM 3.3     g++4.4 -O2 *
 ./tests/atomics-4.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
 ./tests/atomics-6.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
 ./tests/atomics-swap.ispc runfail  x86-64      generic-4   Linux LLVM 3.3     g++4.4 -O2 *
@@ -158,10 +154,6 @@
 ./tests/funcptr-null-4.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4     g++4.4 -O2 *
 ./tests/funcptr-null-5.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4     g++4.4 -O2 *
 ./tests/funcptr-null-6.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/avg-down-int8.ispc compfail     x86    avx1-i32x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/avg-up-int8.ispc compfail     x86    avx1-i32x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64    avx1-i32x16   Linux LLVM 3.4     g++4.4 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64    avx1-i32x16   Linux LLVM 3.4     g++4.4 -O2 *
 ./tests/array-gather-ifs.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
 ./tests/array-gather-multi-unif.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
 ./tests/array-gather-unif.ispc runfail  x86-64      generic-4   Linux LLVM 3.4     g++4.4 -O2 *
@@ -362,10 +354,6 @@
 ./tests/reduce-equal-13.ispc compfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.7 -O2 *
 ./tests/reduce-equal-5.ispc compfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.7 -O2 *
 ./tests/reduce-equal-6.ispc compfail  x86-64     sse4-i8x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail     x86    avx1-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail     x86    avx1-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64    avx1-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64    avx1-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
 ./tests/avg-down-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
 ./tests/avg-up-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
 ./tests/avg-down-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.3     g++4.7 -O2 *
@@ -425,10 +413,6 @@
 ./tests/funcptr-null-4.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4     g++4.7 -O2 *
 ./tests/funcptr-null-5.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4     g++4.7 -O2 *
 ./tests/funcptr-null-6.ispc runfail  x86-64     sse4-i8x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail     x86    avx1-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail     x86    avx1-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/avg-down-int8.ispc compfail  x86-64    avx1-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
-./tests/avg-up-int8.ispc compfail  x86-64    avx1-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
 ./tests/avg-down-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
 ./tests/avg-up-int8.ispc compfail     x86  avx1.1-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
 ./tests/avg-down-int8.ispc compfail  x86-64  avx1.1-i32x16   Linux LLVM 3.4     g++4.7 -O2 *
diff --git a/llvm_patches/3_3_0001-Fix-PR16807.patch b/llvm_patches/3_3_0001-Fix-PR16807.patch
new file mode 100755
index 00000000..daf1327c
--- /dev/null
+++ b/llvm_patches/3_3_0001-Fix-PR16807.patch
@@ -0,0 +1,78 @@
+From b9c47f44691cb9a648b9fa1ae373f0defe53c757 Mon Sep 17 00:00:00 2001
+From: Michael Liao <michael.hliao@gmail.com>
+Date: Thu, 10 Oct 2013 16:47:00 -0700
+Subject: [PATCH] Fix PR16807
+
+- Lower signed division by constant powers-of-2 to target-independent
+  DAG operators instead of target-dependent ones to support them on
+  targets where vector types are legal but shift operators on those types
+  are illegal, e.g. on AVX, PSRAW is only available on <8 x i16> though
+  <16 x i16> is a legal type.
+---
+ lib/Target/X86/X86ISelLowering.cpp | 22 ++++++++++++++++------
+ test/CodeGen/X86/pr16807.ll        | 18 ++++++++++++++++++
+ 2 files changed, 34 insertions(+), 6 deletions(-)
+ create mode 100644 test/CodeGen/X86/pr16807.ll
+
+diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
+index bd5ad4e..518bb90 100644
+--- lib/Target/X86/X86ISelLowering.cpp
++++ lib/Target/X86/X86ISelLowering.cpp
+@@ -12462,14 +12462,24 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
+       (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
+     unsigned lg2 = SplatValue.countTrailingZeros();
+     // Splat the sign bit.
+-    SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32);
+-    SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG);
++    SmallVector<SDValue, 16> Sz(NumElts,
++                                DAG.getConstant(EltTy.getSizeInBits() - 1,
++                                                EltTy));
++    SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0,
++                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0],
++                                          NumElts));
+     // Add (N0 < 0) ? abs2 - 1 : 0;
+-    SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32);
+-    SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG);
++    SmallVector<SDValue, 16> Amt(NumElts,
++                                 DAG.getConstant(EltTy.getSizeInBits() - lg2,
++                                                 EltTy));
++    SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN,
++                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0],
++                                          NumElts));
+     SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
+-    SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32);
+-    SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG);
++    SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(lg2, EltTy));
++    SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD,
++                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0],
++                                          NumElts));
+ 
+     // If we're dividing by a positive value, we're done.  Otherwise, we must
+     // negate the result.
+diff --git a/test/CodeGen/X86/pr16807.ll b/test/CodeGen/X86/pr16807.ll
+new file mode 100644
+index 0000000..6d55d99
+--- /dev/null
++++ test/CodeGen/X86/pr16807.ll
+@@ -0,0 +1,18 @@
++; RUN: llc < %s -mtriple=x86_64-linux-gnu -mcpu=core-avx-i | FileCheck %s
++
++define <16 x i16> @f_fu(<16 x i16> %bf) {
++allocas:
++  %avg.i.i = sdiv <16 x i16> %bf, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
++  ret <16 x i16> %avg.i.i
++}
++
++; CHECK: f_fu
++; CHECK: psraw
++; CHECK: psrlw
++; CHECK: paddw
++; CHECK: psraw
++; CHECK: psraw
++; CHECK: psrlw
++; CHECK: paddw
++; CHECK: psraw
++; CHECK: ret
+-- 
+1.8.1.2
+