diff --git a/builtins-avx-common.ll b/builtins-avx-common.ll index 579e471d..49aa8664 100644 --- a/builtins-avx-common.ll +++ b/builtins-avx-common.ll @@ -58,7 +58,7 @@ define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats -declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone define internal float @__round_uniform_float(float) nounwind readonly alwaysinline { ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 @@ -78,7 +78,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli ; It doesn't matter what we pass as a, since we only need the r0 value ; here. So we pass the same register for both. %xi = insertelement <4 x float> undef, float %0, i32 0 - %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8) + %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8) %rs = extractelement <4 x float> %xr, i32 0 ret float %rs } @@ -87,7 +87,7 @@ define internal float @__floor_uniform_float(float) nounwind readonly alwaysinli ; see above for round_ss instrinsic discussion... %xi = insertelement <4 x float> undef, float %0, i32 0 ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 - %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9) + %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9) %rs = extractelement <4 x float> %xr, i32 0 ret float %rs } @@ -96,7 +96,7 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin ; see above for round_ss instrinsic discussion... %xi = insertelement <4 x float> undef, float %0, i32 0 ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 - %xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10) + %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10) %rs = extractelement <4 x float> %xr, i32 0 ret float %rs } diff --git a/builtins-avx-x2.ll b/builtins-avx-x2.ll index d5696097..b7f1d382 100644 --- a/builtins-avx-x2.ll +++ b/builtins-avx-x2.ll @@ -233,7 +233,7 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1) %v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2) %scalar1 = extractelement <8 x float> %v2, i32 0 - %scalar2 = extractelement <8 x float> %v2, i32 1 + %scalar2 = extractelement <8 x float> %v2, i32 4 %sum = fadd float %scalar1, %scalar2 ret float %sum } @@ -316,9 +316,7 @@ define internal double @__reduce_add_double(<16 x double>) nounwind readonly alw %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd) %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0) - %scalar1 = extractelement <4 x double> %sum0, i32 0 - %scalar2 = extractelement <4 x double> %sum1, i32 1 - %sum = fadd double %scalar1, %scalar2 + %sum = extractelement <4 x double> %sum1, i32 0 ret double %sum } diff --git a/builtins-avx.ll b/builtins-avx.ll index 6b8faf39..055fc7bb 100644 --- a/builtins-avx.ll +++ b/builtins-avx.ll @@ -220,7 +220,7 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi %v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0) %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1) %scalar1 = extractelement <8 x float> %v2, i32 0 - %scalar2 = extractelement <8 x float> %v2, i32 1 + %scalar2 = extractelement <8 x float> %v2, i32 4 %sum = fadd float %scalar1, %scalar2 ret float %sum } @@ -294,11 +294,10 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa <4 x i32> %v1 = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> - %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1) - %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0) - %scalar1 = extractelement <4 x double> %sum0, i32 0 - %scalar2 = extractelement <4 x double> %sum1, i32 1 - %sum = fadd double %scalar1, %scalar2 + %sum01 = fadd <4 x double> %v0, %v1 + %red0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum01, <4 x double> %sum01) + %red1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %red0, <4 x double> %red0) + %sum = extractelement <4 x double> %red1, i32 0 ret double %sum } diff --git a/run_tests.py b/run_tests.py index d72bc89a..e5b7390d 100755 --- a/run_tests.py +++ b/run_tests.py @@ -204,4 +204,6 @@ if __name__ == '__main__': t.join() error_count += t.exitcode print + if error_count > 0: + print "%d / %d tests FAILED!" % (error_count, total_tests) sys.exit(error_count) diff --git a/test_static.cpp b/test_static.cpp index caee3332..0ee4810a 100644 --- a/test_static.cpp +++ b/test_static.cpp @@ -64,8 +64,8 @@ int main(int argc, char *argv[]) { int w = width(); assert(w <= 16); - float r1[16]; - memset(r1, 0, 16*sizeof(float)); + float returned_result[16]; + memset(returned_result, 0, 16*sizeof(float)); float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 }; @@ -73,36 +73,37 @@ int main(int argc, char *argv[]) { float b = 5.; #if (TEST_SIG == 0) - f_v(r1); + f_v(returned_result); #elif (TEST_SIG == 1) - f_f(r1, vfloat); + f_f(returned_result, vfloat); #elif (TEST_SIG == 2) - f_fu(r1, vfloat, b); + f_fu(returned_result, vfloat, b); #elif (TEST_SIG == 3) - f_fi(r1, vfloat, vint); + f_fi(returned_result, vfloat, vint); #elif (TEST_SIG == 4) - f_du(r1, vdouble, 5.); + f_du(returned_result, vdouble, 5.); #elif (TEST_SIG == 5) - f_duf(r1, vdouble, 5.f); + f_duf(returned_result, vdouble, 5.f); #elif (TEST_SIG == 6) - f_di(r1, vdouble, vint2); + f_di(returned_result, vdouble, vint2); #else #error "Unknown or unset TEST_SIG value" #endif - float r2[16]; - memset(r2, 0, 16*sizeof(float)); - result(r2); + float expected_result[16]; + memset(expected_result, 0, 16*sizeof(float)); + result(expected_result); int errors = 0; for (int i = 0; i < w; ++i) { - if (r1[i] != r2[i]) { + if (returned_result[i] != expected_result[i]) { #ifdef EXPECT_FAILURE // bingo, failed return 1; #else - printf("%s: value %d disagrees: should be %f [%a], returned %f [%a]\n", - argv[0], i, r1[i], r1[i], r2[i], r2[i]); + printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n", + argv[0], i, returned_result[i], returned_result[i], + expected_result[i], expected_result[i]); ++errors; #endif // EXPECT_FAILURE }