diff --git a/.gitignore b/.gitignore
index 4df2d277..abbdbbe9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ depend
 ispc
 ispc_test
 objs
+docs/doxygen
diff --git a/stdlib-avx.ll b/stdlib-avx.ll
index fff3719f..f24c146b 100644
--- a/stdlib-avx.ll
+++ b/stdlib-avx.ll
@@ -525,12 +525,57 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
 }
 
 
-define void @__masked_store_blend_64(<8 x i64>* nocapture, <8 x i64>,
-                                     <8 x i32>) nounwind alwaysinline {
-  ; always just serialize it
-  ; FIXME: should implement the "do two 32-bit masked stores" stuff that
-  ; other targets do...
-  call void @__masked_store_64(<8 x i64>* nocapture %0, <8 x i64> %1, <8 x i32> %2)
+; Masked store of eight 64-bit values, done as load / blend / store: the
+; old contents of %ptr are loaded, blended with %new under %i32mask (the
+; blendvps calls pass %new as the second source, selected by the mask),
+; and the full <8 x i64> result is written back to %ptr.
+define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+                                     <8 x i32> %i32mask) nounwind alwaysinline {
+  %oldValue = load <8 x i64>* %ptr, align 8
+  %mask = bitcast <8 x i32> %i32mask to <8 x float>
+
+  ; Do 4x64-bit blends by doing two 8-wide 32-bit blends, where the
+  ; <8 x float> operands are actually bitcast <4 x i64> values
+  ;
+  ; set up the first four 64-bit values
+  %old01 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old01f = bitcast <4 x i64> %old01 to <8 x float>
+  %new01 = shufflevector <8 x i64> %new, <8 x i64> undef,
+                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new01f = bitcast <4 x i64> %new01 to <8 x float>
+  ; compute mask--each index is doubled up so both 32-bit halves of a
+  %mask01 = shufflevector <8 x float> %mask, <8 x float> undef,
+                          <8 x i32> <i32 0, i32 0, i32 1, i32 1,
+                                     i32 2, i32 2, i32 3, i32 3>
+  ; 64-bit lane see the same mask element; now blend them
+  %result01f = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %old01f,
+                                                       <8 x float> %new01f,
+                                                       <8 x float> %mask01)
+  %result01 = bitcast <8 x float> %result01f to <4 x i64>
+
+  ; and again, for the last four 64-bit values
+  %old23 = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+                         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old23f = bitcast <4 x i64> %old23 to <8 x float>
+  %new23 = shufflevector <8 x i64> %new, <8 x i64> undef,
+                         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new23f = bitcast <4 x i64> %new23 to <8 x float>
+  ; compute mask--note that the indices are doubled-up here as well
+  %mask23 = shufflevector <8 x float> %mask, <8 x float> undef,
+                          <8 x i32> <i32 4, i32 4, i32 5, i32 5,
+                                     i32 6, i32 6, i32 7, i32 7>
+  ; and blend them
+  %result23f = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %old23f,
+                                                       <8 x float> %new23f,
+                                                       <8 x float> %mask23)
+  %result23 = bitcast <8 x float> %result23f to <4 x i64>
+
+  ; reconstruct the final <8 x i64> vector
+  %final = shufflevector <4 x i64> %result01, <4 x i64> %result23,
+                         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                    i32 4, i32 5, i32 6, i32 7>
+  store <8 x i64> %final, <8 x i64> * %ptr, align 8
   ret void
 }
 