From 16be1d313e88fe4e4407477837f78d7208d93ba8 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 25 Jul 2011 07:41:37 +0100 Subject: [PATCH] AVX updates / improvements. Add optimization patterns to detect and simplify masked loads and stores with the mask all on / all off. Enable AVX for LLVM 3.0 builds (still generally hits bugs / unimplemented stuff on the LLVM side, but it's getting there). --- main.cpp | 8 +++-- opt.cpp | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 92 insertions(+), 6 deletions(-) diff --git a/main.cpp b/main.cpp index 13d0631e..a581ea77 100644 --- a/main.cpp +++ b/main.cpp @@ -91,7 +91,11 @@ static void usage(int ret) { printf(" disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n"); printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n"); printf(" disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n"); +#if defined(LLVM_3_0) || defined(LLVM_3_0svn) + printf(" [--target={sse2,sse4,sse4x2,avx}] Select target ISA (SSE4 is default unless compiling for atom; then SSE2 is.)\n"); +#else printf(" [--target={sse2,sse4,sse4x2}] Select target ISA (SSE4 is default unless compiling for atom; then SSE2 is.)\n"); +#endif // LLVM 3.0 printf(" [--version]\t\t\t\tPrint ispc version\n"); printf(" [--woff]\t\t\t\tDisable warnings\n"); printf(" [--wno-perf]\t\t\tDon't issue warnings related to performance-related issues\n"); @@ -118,13 +122,13 @@ static void lDoTarget(const char *target) { g->target.nativeVectorWidth = 4; g->target.vectorWidth = 8; } -#if 0 +#if defined(LLVM_3_0) || defined(LLVM_3_0svn) else if (!strcasecmp(target, "avx")) { g->target.isa = Target::AVX; g->target.nativeVectorWidth = 8; g->target.vectorWidth = 8; } -#endif +#endif // LLVM 3.0 else usage(1); } diff --git a/opt.cpp b/opt.cpp index 1f7d4a8e..5626d9cf 100644 --- a/opt.cpp +++ b/opt.cpp @@ -304,6 +304,7 @@ Optimize(llvm::Module *module, int optLevel) { 
true /* simplify lib calls */, false /* may have exceptions */, llvm::createFunctionInliningPass()); + #else llvm::PassManagerBuilder builder; builder.OptLevel = 3; @@ -346,9 +347,9 @@ Optimize(llvm::Module *module, int optLevel) { /** This is a relatively simple optimization pass that does a few small optimizations that LLVM's x86 optimizer doesn't currently handle. (Specifically, MOVMSK of a constant can be replaced with the - corresponding constant value, and a BLENDVPS with either an 'all on' or - 'all off' blend factor can be replaced with the corredponding value of - one of the two operands. + corresponding constant value, and BLENDVPS and AVX masked load/store with + either an 'all on' or 'all off' mask can be replaced with simpler + operations.) @todo The better thing to do would be to submit a patch to LLVM to get these; they're presumably pretty simple patterns to match. @@ -408,8 +409,13 @@ IntrinsicsOpt::IntrinsicsOpt() llvm::Function *sseMovmsk = llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps); maskInstructions.push_back(sseMovmsk); - maskInstructions.push_back(m->module->getFunction("llvm.x86.avx.movmsk.ps")); maskInstructions.push_back(m->module->getFunction("__movmsk")); +#if defined(LLVM_3_0) || defined(LLVM_3_0svn) + llvm::Function *avxMovmsk = + llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_movmsk_ps_256); + assert(avxMovmsk != NULL); + maskInstructions.push_back(avxMovmsk); +#endif // And all of the blend instructions blendInstructions.push_back(BlendInstruction( @@ -494,6 +500,19 @@ lIsUndef(llvm::Value *value) { bool IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { +#if defined(LLVM_3_0) || defined(LLVM_3_0svn) + llvm::Function *avxMaskedLoad32 = + llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_maskload_ps_256); + llvm::Function *avxMaskedLoad64 = + llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_maskload_pd_256); + llvm::Function *avxMaskedStore32 
= + llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_maskstore_ps_256); + llvm::Function *avxMaskedStore64 = + llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_maskstore_pd_256); + assert(avxMaskedLoad32 != NULL && avxMaskedStore32 != NULL); + assert(avxMaskedLoad64 != NULL && avxMaskedStore64 != NULL); +#endif + bool modifiedAny = false; restart: for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { @@ -564,6 +583,69 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { goto restart; } } +#if defined(LLVM_3_0) || defined(LLVM_3_0svn) + else if (callInst->getCalledFunction() == avxMaskedLoad32 || + callInst->getCalledFunction() == avxMaskedLoad64) { + llvm::Value *factor = callInst->getArgOperand(1); + int mask = lGetMask(factor); + if (mask == 0) { + /* nothing being loaded, replace with undef value */ + llvm::Type *returnType = callInst->getType(); + assert(llvm::isa<llvm::VectorType>(returnType)); + llvm::Value *undefValue = llvm::UndefValue::get(returnType); + llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), + iter, undefValue); + modifiedAny = true; + goto restart; + } + else if (mask == 0xff) { + /* all lanes active; replace with a regular load */ + llvm::Type *returnType = callInst->getType(); + assert(llvm::isa<llvm::VectorType>(returnType)); + /* cast the i8 * to the appropriate type */ + llvm::Value *castPtr = + new llvm::BitCastInst(callInst->getArgOperand(0), + llvm::PointerType::get(returnType, 0), + "ptr2vec", callInst); + lCopyMetadata(castPtr, callInst); + llvm::Instruction *loadInst = + new llvm::LoadInst(castPtr, "load", false /* not volatile */, + (callInst->getCalledFunction() == avxMaskedLoad32) ? 4 : 8 /* align: element size only; 0 would mean the vector type's ABI alignment, which the i8 * operand of the masked load does not guarantee */, (llvm::Instruction *)NULL); + lCopyMetadata(loadInst, callInst); + llvm::ReplaceInstWithInst(callInst, loadInst); + modifiedAny = true; + goto restart; + } + } + else if (callInst->getCalledFunction() == avxMaskedStore32 || + callInst->getCalledFunction() == avxMaskedStore64) { + // NOTE: mask is the 2nd parameter, not the 3rd one!! 
+ llvm::Value *factor = callInst->getArgOperand(1); + int mask = lGetMask(factor); + if (mask == 0) { + // nothing actually being stored, just remove the inst + callInst->eraseFromParent(); + modifiedAny = true; + goto restart; + } + else if (mask == 0xff) { + // all lanes storing, so replace with a regular store + llvm::Value *rvalue = callInst->getArgOperand(1); + llvm::Type *storeType = rvalue->getType(); + llvm::Value *castPtr = + new llvm::BitCastInst(callInst->getArgOperand(0), + llvm::PointerType::get(storeType, 0), + "ptr2vec", callInst); + lCopyMetadata(castPtr, callInst); + llvm::Instruction *storeInst = + new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL); + lCopyMetadata(storeInst, callInst); + llvm::ReplaceInstWithInst(callInst, storeInst); + modifiedAny = true; + goto restart; + } + } +#endif } return modifiedAny; }