Add SSE4 target optimized for computation with 8-bit datatypes.

This change adds a new 'sse4-8' target, where programCount is 16 and
the mask element size is 8 bits (i.e., the most appropriate sizing of
the mask for SIMD computation with 8-bit datatypes).
This commit is contained in:
Matt Pharr
2013-07-23 17:30:32 -07:00
parent 15a3ef370a
commit 53414f12e6
7 changed files with 578 additions and 7 deletions

13
opt.cpp
View File

@@ -670,14 +670,17 @@ IntrinsicsOpt::IntrinsicsOpt()
// All of the mask instructions we may encounter. Note that even if
// compiling for AVX, we may still encounter the regular 4-wide SSE
// MOVMSK instruction.
llvm::Function *sseMovmsk =
llvm::Function *ssei8Movmsk =
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse2_pmovmskb_128);
maskInstructions.push_back(ssei8Movmsk);
llvm::Function *sseFloatMovmsk =
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps);
maskInstructions.push_back(sseMovmsk);
maskInstructions.push_back(sseFloatMovmsk);
maskInstructions.push_back(m->module->getFunction("__movmsk"));
llvm::Function *avxMovmsk =
llvm::Function *avxFloatMovmsk =
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_movmsk_ps_256);
Assert(avxMovmsk != NULL);
maskInstructions.push_back(avxMovmsk);
Assert(avxFloatMovmsk != NULL);
maskInstructions.push_back(avxFloatMovmsk);
// And all of the blend instructions
blendInstructions.push_back(BlendInstruction(