More AVX fixes
Fix RNG state initialization for 16-wide targets. Fix a number of bugs in the reduce_add builtin implementations for AVX. Fix some tests that had incorrect expected results for the 16-wide case.
This commit is contained in:
@@ -232,8 +232,8 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always
|
||||
%v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %va, <8 x float> %vb)
|
||||
%v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1)
|
||||
%v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2)
|
||||
%scalar1 = extractelement <8 x float> %v2, i32 0
|
||||
%scalar2 = extractelement <8 x float> %v2, i32 4
|
||||
%scalar1 = extractelement <8 x float> %v3, i32 0
|
||||
%scalar2 = extractelement <8 x float> %v3, i32 4
|
||||
%sum = fadd float %scalar1, %scalar2
|
||||
ret float %sum
|
||||
}
|
||||
@@ -316,7 +316,9 @@ define internal double @__reduce_add_double(<16 x double>) nounwind readonly alw
|
||||
|
||||
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd)
|
||||
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
|
||||
%sum = extractelement <4 x double> %sum1, i32 0
|
||||
%final0 = extractelement <4 x double> %sum1, i32 0
|
||||
%final1 = extractelement <4 x double> %sum1, i32 2
|
||||
%sum = fadd double %final0, %final1
|
||||
ret double %sum
|
||||
}
|
||||
|
||||
|
||||
@@ -294,10 +294,12 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa
|
||||
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
%v1 = shufflevector <8 x double> %0, <8 x double> undef,
|
||||
<4 x i32> <i32 4, i32 5, i32 6, i32 7>
|
||||
%sum01 = fadd <4 x double> %v0, %v1
|
||||
%red0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum01, <4 x double> %sum01)
|
||||
%red1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %red0, <4 x double> %red0)
|
||||
%sum = extractelement <4 x double> %red1, i32 0
|
||||
%sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
|
||||
%sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
|
||||
%final0 = extractelement <4 x double> %sum1, i32 0
|
||||
%final1 = extractelement <4 x double> %sum1, i32 2
|
||||
%sum = fadd double %final0, %final1
|
||||
|
||||
ret double %sum
|
||||
}
|
||||
|
||||
|
||||
@@ -2862,6 +2862,12 @@ static inline void seed_rng(reference uniform RNGState state, uniform unsigned i
|
||||
seed = __seed4(state, 0, seed);
|
||||
if (programCount == 8)
|
||||
__seed4(state, 4, seed ^ 0xbeeff00d);
|
||||
if (programCount == 16) {
|
||||
__seed4(state, 4, seed ^ 0xbeeff00d);
|
||||
__seed4(state, 8, ((seed & 0xffff) << 16) | (seed >> 16));
|
||||
__seed4(state, 12, (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) |
|
||||
((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
|
||||
}
|
||||
}
|
||||
|
||||
static inline void fastmath() {
|
||||
|
||||
@@ -11,5 +11,5 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 10;
|
||||
RET[programIndex] = max(10, 1 + programIndex);
|
||||
}
|
||||
|
||||
@@ -9,7 +9,10 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
uniform int pc[16] = { 1, 1, 2, 1, 2, 2, 3, 1, 1, 2, 2, 3, 2, 3, 3, 4 };
|
||||
uniform int pc[16] = { 1, 1, 2, 1,
|
||||
2, 2, 3, 1,
|
||||
2, 2, 3, 2,
|
||||
3, 3, 4, 1 };
|
||||
RET[programIndex] = pc[programIndex];
|
||||
}
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ export void result(uniform float RET[]) {
|
||||
uniform int x = -1234;
|
||||
if (programCount == 4) x = 10;
|
||||
else if (programCount == 8) x = 36;
|
||||
else if (programCount == 16) x = 124;
|
||||
else if (programCount == 16) x = 136;
|
||||
RET[programIndex] = x;
|
||||
}
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ export void result(uniform float RET[]) {
|
||||
uniform int x = -1234;
|
||||
if (programCount == 4) x = 10;
|
||||
else if (programCount == 8) x = 36;
|
||||
else if (programCount == 16) x = 124;
|
||||
else if (programCount == 16) x = 136;
|
||||
RET[programIndex] = x;
|
||||
}
|
||||
|
||||
|
||||
@@ -9,17 +9,6 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
uniform float<5> z = c ? x : y;
|
||||
RET[programIndex] = z[programIndex];
|
||||
}
|
||||
/*CO return x[y];*/
|
||||
|
||||
/*CO int index = aFOO[programIndex];*/
|
||||
/*CO index = min(index, 3);*/
|
||||
/*CO return x[index];*/
|
||||
|
||||
/*CO return x << 1;*/
|
||||
/*CO return c[0] ? 1 : 0;*/
|
||||
/*CO x = b;*/
|
||||
/*CO y = b;*/
|
||||
/*CO return x+y;*/
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
|
||||
Reference in New Issue
Block a user