AVX bugfixes

2011-09-01 14:23:10 -07:00
parent 9cd92facbd
commit 08cad7a665
5 changed files with 29 additions and 29 deletions
--- a/builtins-avx-common.ll
+++ b/builtins-avx-common.ll
@@ -58,7 +58,7 @@ define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats

-declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone

 define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
@@ -78,7 +78,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
  ;  It doesn't matter what we pass as a, since we only need the r0 value
  ;  here.  So we pass the same register for both.
  %xi = insertelement <4 x float> undef, float %0, i32 0
-  %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }
@@ -87,7 +87,7 @@ define internal float @__floor_uniform_float(float) nounwind readonly alwaysinli
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
-  %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }
@@ -96,7 +96,7 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
  ; see above for round_ss instrinsic discussion...
  %xi = insertelement <4 x float> undef, float %0, i32 0
  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
-  %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
+  %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
  %rs = extractelement <4 x float> %xr, i32 0
  ret float %rs
 }
--- a/builtins-avx-x2.ll
+++ b/builtins-avx-x2.ll
@@ -233,7 +233,7 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always
  %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
  %v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2)
  %scalar1 = extractelement <8 x float> %v2, i32 0
-  %scalar2 = extractelement <8 x float> %v2, i32 1
+  %scalar2 = extractelement <8 x float> %v2, i32 4
  %sum = fadd float %scalar1, %scalar2
  ret float %sum
 }
@@ -316,9 +316,7 @@ define internal double @__reduce_add_double(<16 x double>) nounwind readonly alw

  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd)
  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
-  %scalar1 = extractelement <4 x double> %sum0, i32 0
-  %scalar2 = extractelement <4 x double> %sum1, i32 1
-  %sum = fadd double %scalar1, %scalar2
+  %sum = extractelement <4 x double> %sum1, i32 0
  ret double %sum
 }

--- a/builtins-avx.ll
+++ b/builtins-avx.ll
@@ -220,7 +220,7 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
  %v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
  %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
  %scalar1 = extractelement <8 x float> %v2, i32 0
-  %scalar2 = extractelement <8 x float> %v2, i32 1
+  %scalar2 = extractelement <8 x float> %v2, i32 4
  %sum = fadd float %scalar1, %scalar2
  ret float %sum
 }
@@ -294,11 +294,10 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v1 = shufflevector <8 x double> %0, <8 x double> undef,
                      <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
-  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
-  %scalar1 = extractelement <4 x double> %sum0, i32 0
-  %scalar2 = extractelement <4 x double> %sum1, i32 1
-  %sum = fadd double %scalar1, %scalar2
+  %sum01 = fadd <4 x double> %v0, %v1
+  %red0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum01, <4 x double> %sum01)
+  %red1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %red0, <4 x double> %red0)
+  %sum = extractelement <4 x double> %red1, i32 0
  ret double %sum
 }

--- a/run_tests.py
+++ b/run_tests.py
@@ -204,4 +204,6 @@ if __name__ == '__main__':
        t.join()
        error_count += t.exitcode
    print
+    if error_count > 0:
+        print "%d / %d tests FAILED!" % (error_count, total_tests)
    sys.exit(error_count)
--- a/test_static.cpp
+++ b/test_static.cpp
@@ -64,8 +64,8 @@ int main(int argc, char *argv[]) {
    int w = width();
    assert(w <= 16);

-    float r1[16];
-    memset(r1, 0, 16*sizeof(float));
+    float returned_result[16];
+    memset(returned_result, 0, 16*sizeof(float));
    float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
@@ -73,36 +73,37 @@ int main(int argc, char *argv[]) {
    float b = 5.;

 #if (TEST_SIG == 0)
-    f_v(r1);
+    f_v(returned_result);
 #elif (TEST_SIG == 1)
-    f_f(r1, vfloat);
+    f_f(returned_result, vfloat);
 #elif (TEST_SIG == 2)
-    f_fu(r1, vfloat, b);
+    f_fu(returned_result, vfloat, b);
 #elif (TEST_SIG == 3)
-    f_fi(r1, vfloat, vint);
+    f_fi(returned_result, vfloat, vint);
 #elif (TEST_SIG == 4)
-    f_du(r1, vdouble, 5.);
+    f_du(returned_result, vdouble, 5.);
 #elif (TEST_SIG == 5)
-    f_duf(r1, vdouble, 5.f);
+    f_duf(returned_result, vdouble, 5.f);
 #elif (TEST_SIG == 6)
-    f_di(r1, vdouble, vint2);
+    f_di(returned_result, vdouble, vint2);
 #else
 #error "Unknown or unset TEST_SIG value"
 #endif    

-    float r2[16];
-    memset(r2, 0, 16*sizeof(float));
-    result(r2);
+    float expected_result[16];
+    memset(expected_result, 0, 16*sizeof(float));
+    result(expected_result);

    int errors = 0;
    for (int i = 0; i < w; ++i) {
-        if (r1[i] != r2[i]) {
+        if (returned_result[i] != expected_result[i]) {
 #ifdef EXPECT_FAILURE
            // bingo, failed
            return 1;
 #else
-            printf("%s: value %d disagrees: should be %f [%a], returned %f [%a]\n",
-                   argv[0], i, r1[i], r1[i], r2[i], r2[i]);
+            printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
+                   argv[0], i, returned_result[i], returned_result[i], 
+                   expected_result[i], expected_result[i]);
            ++errors;
 #endif // EXPECT_FAILURE
        }