AVX bugfixes

This commit is contained in:
Matt Pharr
2011-09-01 14:23:10 -07:00
parent 9cd92facbd
commit 08cad7a665
5 changed files with 29 additions and 29 deletions

View File

@@ -58,7 +58,7 @@ define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats ;; rounding floats
declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline { define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
@@ -78,7 +78,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
; It doesn't matter what we pass as a, since we only need the r0 value ; It doesn't matter what we pass as a, since we only need the r0 value
; here. So we pass the same register for both. ; here. So we pass the same register for both.
%xi = insertelement <4 x float> undef, float %0, i32 0 %xi = insertelement <4 x float> undef, float %0, i32 0
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8) %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
%rs = extractelement <4 x float> %xr, i32 0 %rs = extractelement <4 x float> %xr, i32 0
ret float %rs ret float %rs
} }
@@ -87,7 +87,7 @@ define internal float @__floor_uniform_float(float) nounwind readonly alwaysinli
; see above for round_ss intrinsic discussion... ; see above for round_ss intrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0 %xi = insertelement <4 x float> undef, float %0, i32 0
; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9 ; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9) %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
%rs = extractelement <4 x float> %xr, i32 0 %rs = extractelement <4 x float> %xr, i32 0
ret float %rs ret float %rs
} }
@@ -96,7 +96,7 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
; see above for round_ss intrinsic discussion... ; see above for round_ss intrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0 %xi = insertelement <4 x float> undef, float %0, i32 0
; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10 ; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10) %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
%rs = extractelement <4 x float> %xr, i32 0 %rs = extractelement <4 x float> %xr, i32 0
ret float %rs ret float %rs
} }

View File

@@ -233,7 +233,7 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1) %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
%v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2) %v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2)
%scalar1 = extractelement <8 x float> %v2, i32 0 %scalar1 = extractelement <8 x float> %v2, i32 0
%scalar2 = extractelement <8 x float> %v2, i32 1 %scalar2 = extractelement <8 x float> %v2, i32 4
%sum = fadd float %scalar1, %scalar2 %sum = fadd float %scalar1, %scalar2
ret float %sum ret float %sum
} }
@@ -316,9 +316,7 @@ define internal double @__reduce_add_double(<16 x double>) nounwind readonly alw
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd) %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd)
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0) %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
%scalar1 = extractelement <4 x double> %sum0, i32 0 %sum = extractelement <4 x double> %sum1, i32 0
%scalar2 = extractelement <4 x double> %sum1, i32 1
%sum = fadd double %scalar1, %scalar2
ret double %sum ret double %sum
} }

View File

@@ -220,7 +220,7 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0) %v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1) %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
%scalar1 = extractelement <8 x float> %v2, i32 0 %scalar1 = extractelement <8 x float> %v2, i32 0
%scalar2 = extractelement <8 x float> %v2, i32 1 %scalar2 = extractelement <8 x float> %v2, i32 4
%sum = fadd float %scalar1, %scalar2 %sum = fadd float %scalar1, %scalar2
ret float %sum ret float %sum
} }
@@ -294,11 +294,10 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
<4 x i32> <i32 0, i32 1, i32 2, i32 3> <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <8 x double> %0, <8 x double> undef, %v1 = shufflevector <8 x double> %0, <8 x double> undef,
<4 x i32> <i32 4, i32 5, i32 6, i32 7> <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1) %sum01 = fadd <4 x double> %v0, %v1
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0) %red0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum01, <4 x double> %sum01)
%scalar1 = extractelement <4 x double> %sum0, i32 0 %red1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %red0, <4 x double> %red0)
%scalar2 = extractelement <4 x double> %sum1, i32 1 %sum = extractelement <4 x double> %red1, i32 0
%sum = fadd double %scalar1, %scalar2
ret double %sum ret double %sum
} }

View File

@@ -204,4 +204,6 @@ if __name__ == '__main__':
t.join() t.join()
error_count += t.exitcode error_count += t.exitcode
print print
if error_count > 0:
print "%d / %d tests FAILED!" % (error_count, total_tests)
sys.exit(error_count) sys.exit(error_count)

View File

@@ -64,8 +64,8 @@ int main(int argc, char *argv[]) {
int w = width(); int w = width();
assert(w <= 16); assert(w <= 16);
float r1[16]; float returned_result[16];
memset(r1, 0, 16*sizeof(float)); memset(returned_result, 0, 16*sizeof(float));
float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 }; int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
@@ -73,36 +73,37 @@ int main(int argc, char *argv[]) {
float b = 5.; float b = 5.;
#if (TEST_SIG == 0) #if (TEST_SIG == 0)
f_v(r1); f_v(returned_result);
#elif (TEST_SIG == 1) #elif (TEST_SIG == 1)
f_f(r1, vfloat); f_f(returned_result, vfloat);
#elif (TEST_SIG == 2) #elif (TEST_SIG == 2)
f_fu(r1, vfloat, b); f_fu(returned_result, vfloat, b);
#elif (TEST_SIG == 3) #elif (TEST_SIG == 3)
f_fi(r1, vfloat, vint); f_fi(returned_result, vfloat, vint);
#elif (TEST_SIG == 4) #elif (TEST_SIG == 4)
f_du(r1, vdouble, 5.); f_du(returned_result, vdouble, 5.);
#elif (TEST_SIG == 5) #elif (TEST_SIG == 5)
f_duf(r1, vdouble, 5.f); f_duf(returned_result, vdouble, 5.f);
#elif (TEST_SIG == 6) #elif (TEST_SIG == 6)
f_di(r1, vdouble, vint2); f_di(returned_result, vdouble, vint2);
#else #else
#error "Unknown or unset TEST_SIG value" #error "Unknown or unset TEST_SIG value"
#endif #endif
float r2[16]; float expected_result[16];
memset(r2, 0, 16*sizeof(float)); memset(expected_result, 0, 16*sizeof(float));
result(r2); result(expected_result);
int errors = 0; int errors = 0;
for (int i = 0; i < w; ++i) { for (int i = 0; i < w; ++i) {
if (r1[i] != r2[i]) { if (returned_result[i] != expected_result[i]) {
#ifdef EXPECT_FAILURE #ifdef EXPECT_FAILURE
// bingo, failed // bingo, failed
return 1; return 1;
#else #else
printf("%s: value %d disagrees: should be %f [%a], returned %f [%a]\n", printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
argv[0], i, r1[i], r1[i], r2[i], r2[i]); argv[0], i, returned_result[i], returned_result[i],
expected_result[i], expected_result[i]);
++errors; ++errors;
#endif // EXPECT_FAILURE #endif // EXPECT_FAILURE
} }