Explicitly call the PBLENDVB intrinsic for i8 blending with sse4-8.

This is slightly cleaner than truncating the i8 mask to i1 and using a vector select. (It is also probably safer in terms of generating good code.)
2013-07-24 15:10:08 -07:00
parent bba84f247c
commit 2d063925a1
1 changed files with 4 additions and 2 deletions
--- a/builtins/target-sse4-8.ll
+++ b/builtins/target-sse4-8.ll
@@ -398,11 +398,13 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>,
  ret void
 }

+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
 define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>,
                                     <16 x MASK> %mask) nounwind alwaysinline {
-  %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1>
  %old = load <16 x i8>* %0, align 4
-  %blend = select <16 x i1> %mask_as_i1, <16 x i8> %1, <16 x i8> %old
+  %blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1,
+                                                   <16 x i8> %mask)
  store <16 x i8> %blend, <16 x i8>* %0, align 4
  ret void
 }