AVX bugfixes
This commit is contained in:
@@ -58,7 +58,7 @@ define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; rounding floats
|
||||
|
||||
declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||
|
||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||
@@ -78,7 +78,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
|
||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||
; here. So we pass the same register for both.
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
@@ -87,7 +87,7 @@ define internal float @__floor_uniform_float(float) nounwind readonly alwaysinli
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundss, round down 0b01 | don't signal precision exceptions 0b1000 = 0b1001 = 9
|
||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
@@ -96,7 +96,7 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
|
||||
; see above for round_ss intrinsic discussion...
|
||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||
; roundss, round up 0b10 | don't signal precision exceptions 0b1000 = 0b1010 = 10
|
||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||
%rs = extractelement <4 x float> %xr, i32 0
|
||||
ret float %rs
|
||||
}
|
||||
|
||||
@@ -233,7 +233,7 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always
|
||||
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
|
||||
%v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2)
|
||||
%scalar1 = extractelement <8 x float> %v2, i32 0
|
||||
%scalar2 = extractelement <8 x float> %v2, i32 1
|
||||
%scalar2 = extractelement <8 x float> %v2, i32 4
|
||||
%sum = fadd float %scalar1, %scalar2
|
||||
ret float %sum
|
||||
}
|
||||
@@ -316,9 +316,7 @@ define internal double @__reduce_add_double(<16 x double>) nounwind readonly alw
|
||||
|
||||
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd)
|
||||
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
|
||||
%scalar1 = extractelement <4 x double> %sum0, i32 0
|
||||
%scalar2 = extractelement <4 x double> %sum1, i32 1
|
||||
%sum = fadd double %scalar1, %scalar2
|
||||
%sum = extractelement <4 x double> %sum1, i32 0
|
||||
ret double %sum
|
||||
}
|
||||
|
||||
|
||||
@@ -220,7 +220,7 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
|
||||
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
|
||||
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
|
||||
%scalar1 = extractelement <8 x float> %v2, i32 0
|
||||
%scalar2 = extractelement <8 x float> %v2, i32 1
|
||||
%scalar2 = extractelement <8 x float> %v2, i32 4
|
||||
%sum = fadd float %scalar1, %scalar2
|
||||
ret float %sum
|
||||
}
|
||||
@@ -294,11 +294,10 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v1 = shufflevector <8 x double> %0, <8 x double> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
|
||||
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
|
||||
%scalar1 = extractelement <4 x double> %sum0, i32 0
|
||||
%scalar2 = extractelement <4 x double> %sum1, i32 1
|
||||
%sum = fadd double %scalar1, %scalar2
|
||||
%sum01 = fadd <4 x double> %v0, %v1
|
||||
%red0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum01, <4 x double> %sum01)
|
||||
%red1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %red0, <4 x double> %red0)
|
||||
%sum = extractelement <4 x double> %red1, i32 0
|
||||
ret double %sum
|
||||
}
|
||||
|
||||
|
||||
@@ -204,4 +204,6 @@ if __name__ == '__main__':
|
||||
t.join()
|
||||
error_count += t.exitcode
|
||||
print
|
||||
if error_count > 0:
|
||||
print "%d / %d tests FAILED!" % (error_count, total_tests)
|
||||
sys.exit(error_count)
|
||||
|
||||
@@ -64,8 +64,8 @@ int main(int argc, char *argv[]) {
|
||||
int w = width();
|
||||
assert(w <= 16);
|
||||
|
||||
float r1[16];
|
||||
memset(r1, 0, 16*sizeof(float));
|
||||
float returned_result[16];
|
||||
memset(returned_result, 0, 16*sizeof(float));
|
||||
float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
|
||||
double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
|
||||
int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
|
||||
@@ -73,36 +73,37 @@ int main(int argc, char *argv[]) {
|
||||
float b = 5.;
|
||||
|
||||
#if (TEST_SIG == 0)
|
||||
f_v(r1);
|
||||
f_v(returned_result);
|
||||
#elif (TEST_SIG == 1)
|
||||
f_f(r1, vfloat);
|
||||
f_f(returned_result, vfloat);
|
||||
#elif (TEST_SIG == 2)
|
||||
f_fu(r1, vfloat, b);
|
||||
f_fu(returned_result, vfloat, b);
|
||||
#elif (TEST_SIG == 3)
|
||||
f_fi(r1, vfloat, vint);
|
||||
f_fi(returned_result, vfloat, vint);
|
||||
#elif (TEST_SIG == 4)
|
||||
f_du(r1, vdouble, 5.);
|
||||
f_du(returned_result, vdouble, 5.);
|
||||
#elif (TEST_SIG == 5)
|
||||
f_duf(r1, vdouble, 5.f);
|
||||
f_duf(returned_result, vdouble, 5.f);
|
||||
#elif (TEST_SIG == 6)
|
||||
f_di(r1, vdouble, vint2);
|
||||
f_di(returned_result, vdouble, vint2);
|
||||
#else
|
||||
#error "Unknown or unset TEST_SIG value"
|
||||
#endif
|
||||
|
||||
float r2[16];
|
||||
memset(r2, 0, 16*sizeof(float));
|
||||
result(r2);
|
||||
float expected_result[16];
|
||||
memset(expected_result, 0, 16*sizeof(float));
|
||||
result(expected_result);
|
||||
|
||||
int errors = 0;
|
||||
for (int i = 0; i < w; ++i) {
|
||||
if (r1[i] != r2[i]) {
|
||||
if (returned_result[i] != expected_result[i]) {
|
||||
#ifdef EXPECT_FAILURE
|
||||
// bingo, failed
|
||||
return 1;
|
||||
#else
|
||||
printf("%s: value %d disagrees: should be %f [%a], returned %f [%a]\n",
|
||||
argv[0], i, r1[i], r1[i], r2[i], r2[i]);
|
||||
printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
|
||||
argv[0], i, returned_result[i], returned_result[i],
|
||||
expected_result[i], expected_result[i]);
|
||||
++errors;
|
||||
#endif // EXPECT_FAILURE
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user