diff --git a/builtins-avx-x2.ll b/builtins-avx-x2.ll index b7f1d382..5128030a 100644 --- a/builtins-avx-x2.ll +++ b/builtins-avx-x2.ll @@ -232,8 +232,8 @@ define internal float @__reduce_add_float(<16 x float>) nounwind readonly always %v1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %va, <8 x float> %vb) %v2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v1, <8 x float> %v1) %v3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %v2, <8 x float> %v2) - %scalar1 = extractelement <8 x float> %v2, i32 0 - %scalar2 = extractelement <8 x float> %v2, i32 4 + %scalar1 = extractelement <8 x float> %v3, i32 0 + %scalar2 = extractelement <8 x float> %v3, i32 4 %sum = fadd float %scalar1, %scalar2 ret float %sum } @@ -316,7 +316,9 @@ define internal double @__reduce_add_double(<16 x double>) nounwind readonly alw %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %vab, <4 x double> %vcd) %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0) - %sum = extractelement <4 x double> %sum1, i32 0 + %final0 = extractelement <4 x double> %sum1, i32 0 + %final1 = extractelement <4 x double> %sum1, i32 2 + %sum = fadd double %final0, %final1 ret double %sum } diff --git a/builtins-avx.ll b/builtins-avx.ll index 055fc7bb..2cfe3a81 100644 --- a/builtins-avx.ll +++ b/builtins-avx.ll @@ -294,10 +294,12 @@ define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwa <4 x i32> %v1 = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> - %sum01 = fadd <4 x double> %v0, %v1 - %red0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum01, <4 x double> %sum01) - %red1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %red0, <4 x double> %red0) - %sum = extractelement <4 x double> %red1, i32 0 + %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1) + %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0) + %final0 = extractelement <4 x double> %sum1, i32 0 + %final1 = extractelement <4 x double> %sum1, i32 2 + %sum = fadd double %final0, %final1 + ret double %sum } diff --git a/stdlib.ispc b/stdlib.ispc index 6b7ce67f..36d90313 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -2862,6 +2862,12 @@ static inline void seed_rng(reference uniform RNGState state, uniform unsigned i seed = __seed4(state, 0, seed); if (programCount == 8) __seed4(state, 4, seed ^ 0xbeeff00d); + if (programCount == 16) { + __seed4(state, 4, seed ^ 0xbeeff00d); + __seed4(state, 8, ((seed & 0xffff) << 16) | (seed >> 16)); + __seed4(state, 12, (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) | + ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24)); + } } static inline void fastmath() { diff --git a/tests/cwhile-test-60.ispc b/tests/cwhile-test-60.ispc index a9233a49..4a29afe9 100644 --- a/tests/cwhile-test-60.ispc +++ b/tests/cwhile-test-60.ispc @@ -11,5 +11,5 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[programIndex] = 10; + RET[programIndex] = max(10, 1 + programIndex); } diff --git a/tests/popcnt-1.ispc b/tests/popcnt-1.ispc index 5b7a1349..9a64c113 100644 --- a/tests/popcnt-1.ispc +++ b/tests/popcnt-1.ispc @@ -9,7 +9,10 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { } export void result(uniform float RET[]) { - uniform int pc[16] = { 1, 1, 2, 1, 2, 2, 3, 1, 1, 2, 2, 3, 2, 3, 3, 4 }; + uniform int pc[16] = { 1, 1, 2, 1, + 2, 2, 3, 1, + 2, 2, 3, 2, + 3, 3, 4, 1 }; RET[programIndex] = pc[programIndex]; } diff --git a/tests/reduce-add-double-2.ispc b/tests/reduce-add-double-2.ispc index 4ffc6ddf..89c6a493 100644 --- a/tests/reduce-add-double-2.ispc +++ b/tests/reduce-add-double-2.ispc @@ -15,7 +15,7 @@ export void result(uniform float RET[]) { uniform int x = -1234; if (programCount == 4) x = 10; else if (programCount == 8) x = 36; - else if (programCount == 16) x = 124; + else if (programCount == 16) x = 136; RET[programIndex] = x; } diff --git a/tests/reduce-add-float-2.ispc b/tests/reduce-add-float-2.ispc index 3e1368be..6cae8b44 100644 --- a/tests/reduce-add-float-2.ispc +++ b/tests/reduce-add-float-2.ispc @@ -15,7 +15,7 @@ export void result(uniform float RET[]) { uniform int x = -1234; if (programCount == 4) x = 10; else if (programCount == 8) x = 36; - else if (programCount == 16) x = 124; + else if (programCount == 16) x = 136; RET[programIndex] = x; } diff --git a/tests/short-vec-8.ispc b/tests/short-vec-8.ispc index dbe15975..0b8a3b3b 100644 --- a/tests/short-vec-8.ispc +++ b/tests/short-vec-8.ispc @@ -9,17 +9,6 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { uniform float<5> z = c ? x : y; RET[programIndex] = z[programIndex]; } -/*CO return x[y];*/ - -/*CO int index = aFOO[programIndex];*/ -/*CO index = min(index, 3);*/ -/*CO return x[index];*/ - -/*CO return x << 1;*/ -/*CO return c[0] ? 1 : 0;*/ -/*CO x = b;*/ -/*CO y = b;*/ -/*CO return x+y;*/ } export void result(uniform float RET[]) {