AVX bugfixes
This commit is contained in:
@@ -58,7 +58,7 @@ define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline
|
|||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; rounding floats
|
;; rounding floats
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||||
|
|
||||||
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
define internal float @__round_uniform_float(float) nounwind readonly alwaysinline {
|
||||||
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
|
||||||
@@ -78,7 +78,7 @@ define internal float @__round_uniform_float(float) nounwind readonly alwaysinli
|
|||||||
; It doesn't matter what we pass as a, since we only need the r0 value
|
; It doesn't matter what we pass as a, since we only need the r0 value
|
||||||
; here. So we pass the same register for both.
|
; here. So we pass the same register for both.
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
|
||||||
%rs = extractelement <4 x float> %xr, i32 0
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
ret float %rs
|
ret float %rs
|
||||||
}
|
}
|
||||||
@@ -87,7 +87,7 @@ define internal float @__floor_uniform_float(float) nounwind readonly alwaysinli
|
|||||||
; see above for round_ss intrinsic discussion...
|
; see above for round_ss intrinsic discussion...
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
; roundss, round down 0b01 | don't signal precision exceptions 0b1001 = 9
|
||||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
|
||||||
%rs = extractelement <4 x float> %xr, i32 0
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
ret float %rs
|
ret float %rs
|
||||||
}
|
}
|
||||||
@@ -96,7 +96,7 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
|
|||||||
; see above for round_ss intrinsic discussion...
|
; see above for round_ss intrinsic discussion...
|
||||||
%xi = insertelement <4 x float> undef, float %0, i32 0
|
%xi = insertelement <4 x float> undef, float %0, i32 0
|
||||||
; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
; roundss, round up 0b10 | don't signal precision exceptions 0b1010 = 10
|
||||||
%xr = call <4 x float> @llvm.x86.sse.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 10)
|
||||||
%rs = extractelement <4 x float> %xr, i32 0
|
%rs = extractelement <4 x float> %xr, i32 0
|
||||||
ret float %rs
|
ret float %rs
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -233,7 +233,7 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always
|
|||||||
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
|
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
|
||||||
%v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2)
|
%v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2)
|
||||||
%scalar1 = extractelement <8 x float> %v2, i32 0
|
%scalar1 = extractelement <8 x float> %v2, i32 0
|
||||||
%scalar2 = extractelement <8 x float> %v2, i32 1
|
%scalar2 = extractelement <8 x float> %v2, i32 4
|
||||||
%sum = fadd float %scalar1, %scalar2
|
%sum = fadd float %scalar1, %scalar2
|
||||||
ret float %sum
|
ret float %sum
|
||||||
}
|
}
|
||||||
@@ -316,9 +316,7 @@ define internal double @__reduce_add_double(<16 x double>) nounwind readonly alw
|
|||||||
|
|
||||||
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd)
|
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd)
|
||||||
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
|
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
|
||||||
%scalar1 = extractelement <4 x double> %sum0, i32 0
|
%sum = extractelement <4 x double> %sum1, i32 0
|
||||||
%scalar2 = extractelement <4 x double> %sum1, i32 1
|
|
||||||
%sum = fadd double %scalar1, %scalar2
|
|
||||||
ret double %sum
|
ret double %sum
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -220,7 +220,7 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
|
|||||||
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
|
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %0)
|
||||||
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
|
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
|
||||||
%scalar1 = extractelement <8 x float> %v2, i32 0
|
%scalar1 = extractelement <8 x float> %v2, i32 0
|
||||||
%scalar2 = extractelement <8 x float> %v2, i32 1
|
%scalar2 = extractelement <8 x float> %v2, i32 4
|
||||||
%sum = fadd float %scalar1, %scalar2
|
%sum = fadd float %scalar1, %scalar2
|
||||||
ret float %sum
|
ret float %sum
|
||||||
}
|
}
|
||||||
@@ -294,11 +294,10 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
|
|||||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||||
%v1 = shufflevector <8 x double> %0, <8 x double> undef,
|
%v1 = shufflevector <8 x double> %0, <8 x double> undef,
|
||||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||||
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
|
%sum01 = fadd <4 x double> %v0, %v1
|
||||||
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
|
%red0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum01, <4 x double> %sum01)
|
||||||
%scalar1 = extractelement <4 x double> %sum0, i32 0
|
%red1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %red0, <4 x double> %red0)
|
||||||
%scalar2 = extractelement <4 x double> %sum1, i32 1
|
%sum = extractelement <4 x double> %red1, i32 0
|
||||||
%sum = fadd double %scalar1, %scalar2
|
|
||||||
ret double %sum
|
ret double %sum
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -204,4 +204,6 @@ if __name__ == '__main__':
|
|||||||
t.join()
|
t.join()
|
||||||
error_count += t.exitcode
|
error_count += t.exitcode
|
||||||
print
|
print
|
||||||
|
if error_count > 0:
|
||||||
|
print "%d / %d tests FAILED!" % (error_count, total_tests)
|
||||||
sys.exit(error_count)
|
sys.exit(error_count)
|
||||||
|
|||||||
@@ -64,8 +64,8 @@ int main(int argc, char *argv[]) {
|
|||||||
int w = width();
|
int w = width();
|
||||||
assert(w <= 16);
|
assert(w <= 16);
|
||||||
|
|
||||||
float r1[16];
|
float returned_result[16];
|
||||||
memset(r1, 0, 16*sizeof(float));
|
memset(returned_result, 0, 16*sizeof(float));
|
||||||
float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
|
float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
|
||||||
double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
|
double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
|
||||||
int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
|
int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
|
||||||
@@ -73,36 +73,37 @@ int main(int argc, char *argv[]) {
|
|||||||
float b = 5.;
|
float b = 5.;
|
||||||
|
|
||||||
#if (TEST_SIG == 0)
|
#if (TEST_SIG == 0)
|
||||||
f_v(r1);
|
f_v(returned_result);
|
||||||
#elif (TEST_SIG == 1)
|
#elif (TEST_SIG == 1)
|
||||||
f_f(r1, vfloat);
|
f_f(returned_result, vfloat);
|
||||||
#elif (TEST_SIG == 2)
|
#elif (TEST_SIG == 2)
|
||||||
f_fu(r1, vfloat, b);
|
f_fu(returned_result, vfloat, b);
|
||||||
#elif (TEST_SIG == 3)
|
#elif (TEST_SIG == 3)
|
||||||
f_fi(r1, vfloat, vint);
|
f_fi(returned_result, vfloat, vint);
|
||||||
#elif (TEST_SIG == 4)
|
#elif (TEST_SIG == 4)
|
||||||
f_du(r1, vdouble, 5.);
|
f_du(returned_result, vdouble, 5.);
|
||||||
#elif (TEST_SIG == 5)
|
#elif (TEST_SIG == 5)
|
||||||
f_duf(r1, vdouble, 5.f);
|
f_duf(returned_result, vdouble, 5.f);
|
||||||
#elif (TEST_SIG == 6)
|
#elif (TEST_SIG == 6)
|
||||||
f_di(r1, vdouble, vint2);
|
f_di(returned_result, vdouble, vint2);
|
||||||
#else
|
#else
|
||||||
#error "Unknown or unset TEST_SIG value"
|
#error "Unknown or unset TEST_SIG value"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
float r2[16];
|
float expected_result[16];
|
||||||
memset(r2, 0, 16*sizeof(float));
|
memset(expected_result, 0, 16*sizeof(float));
|
||||||
result(r2);
|
result(expected_result);
|
||||||
|
|
||||||
int errors = 0;
|
int errors = 0;
|
||||||
for (int i = 0; i < w; ++i) {
|
for (int i = 0; i < w; ++i) {
|
||||||
if (r1[i] != r2[i]) {
|
if (returned_result[i] != expected_result[i]) {
|
||||||
#ifdef EXPECT_FAILURE
|
#ifdef EXPECT_FAILURE
|
||||||
// bingo, failed
|
// bingo, failed
|
||||||
return 1;
|
return 1;
|
||||||
#else
|
#else
|
||||||
printf("%s: value %d disagrees: should be %f [%a], returned %f [%a]\n",
|
printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
|
||||||
argv[0], i, r1[i], r1[i], r2[i], r2[i]);
|
argv[0], i, returned_result[i], returned_result[i],
|
||||||
|
expected_result[i], expected_result[i]);
|
||||||
++errors;
|
++errors;
|
||||||
#endif // EXPECT_FAILURE
|
#endif // EXPECT_FAILURE
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user