Previously, to compute the size of objects and the offsets of elements within structs, we were using the trick of doing a getelementptr from a NULL base pointer and then casting the result to an int32/int64. However, since we know the target we're compiling for at compile time, we can instead use the corresponding methods on llvm::TargetData to get these values directly. This is mostly a code cleanup, but it may also help the optimizations that lower gathers/scatters to loads/stores do a better job in the presence of structures.
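For reference, the change amounts to replacing constant-folded "GEP from NULL, then ptrtoint" sequences with direct queries of llvm::TargetData. The sketch below is illustrative only and is not part of opt.cpp; the helper names are made up, and it assumes a valid llvm::TargetData for the compilation target. (In the code below, these queries are reached through g->target.SizeOf() and g->target.StructOffset(), which return the results as LLVM constant values; see lGetTypeSize() and the gather/scatter flattening code.)

// Illustrative sketch only, not part of opt.cpp: querying llvm::TargetData
// for sizes and struct element offsets instead of constant-folding a GEP
// from a NULL base pointer.  The helper names are hypothetical.
static uint64_t
lExampleTypeSize(llvm::Type *type, const llvm::TargetData *td) {
    // Number of bytes an allocation of 'type' occupies on this target.
    return td->getTypeAllocSize(type);
}

static uint64_t
lExampleStructElementOffset(llvm::StructType *structType, unsigned element,
                            const llvm::TargetData *td) {
    // Byte offset of the given element, using the target's struct layout
    // (including any padding between elements).
    const llvm::StructLayout *layout = td->getStructLayout(structType);
    return layout->getElementOffset(element);
}

Using these values means the offsets computed by the gather/scatter passes below reflect the target's actual struct layout, including padding.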
/*
  Copyright (c) 2010-2011, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.


   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/** @file opt.cpp
    @brief Implementations of various ispc optimization passes that operate
    on the LLVM IR.
*/
#include "opt.h"
|
|
#include "ctx.h"
|
|
#include "sym.h"
|
|
#include "module.h"
|
|
#include "util.h"
|
|
#include "llvmutil.h"
|
|
|
|
#include <stdio.h>
|
|
#include <map>
|
|
#include <set>
|
|
|
|
#include <llvm/Pass.h>
|
|
#include <llvm/Module.h>
|
|
#include <llvm/PassManager.h>
|
|
#include <llvm/PassRegistry.h>
|
|
#include <llvm/Assembly/PrintModulePass.h>
|
|
#include <llvm/Function.h>
|
|
#include <llvm/BasicBlock.h>
|
|
#include <llvm/Instructions.h>
|
|
#include <llvm/Intrinsics.h>
|
|
#include <llvm/Constants.h>
|
|
#include <llvm/Analysis/ConstantFolding.h>
|
|
#include <llvm/Target/TargetLibraryInfo.h>
|
|
#ifdef LLVM_2_9
|
|
#include <llvm/Support/StandardPasses.h>
|
|
#else
|
|
#include <llvm/Transforms/IPO/PassManagerBuilder.h>
|
|
#endif // LLVM_2_8
|
|
#include <llvm/ADT/Triple.h>
|
|
#include <llvm/Transforms/Scalar.h>
|
|
#include <llvm/Transforms/IPO.h>
|
|
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
|
|
#include <llvm/Target/TargetOptions.h>
|
|
#include <llvm/Target/TargetData.h>
|
|
#include <llvm/Target/TargetMachine.h>
|
|
#include <llvm/Analysis/Verifier.h>
|
|
#include <llvm/Support/raw_ostream.h>
|
|
#include <llvm/Analysis/DIBuilder.h>
|
|
#include <llvm/Analysis/DebugInfo.h>
|
|
#include <llvm/Support/Dwarf.h>
|
|
#ifdef ISPC_IS_LINUX
|
|
#include <alloca.h>
|
|
#elif defined(ISPC_IS_WINDOWS)
|
|
#include <malloc.h>
|
|
#define alloca _alloca
|
|
#endif // ISPC_IS_WINDOWS
|
|
|
|
static llvm::Pass *CreateIntrinsicsOptPass();
|
|
static llvm::Pass *CreateGatherScatterFlattenPass();
|
|
static llvm::Pass *CreateGatherScatterImprovementsPass();
|
|
static llvm::Pass *CreateLowerGatherScatterPass();
|
|
static llvm::Pass *CreateLowerMaskedStorePass();
|
|
static llvm::Pass *CreateMaskedStoreOptPass();
|
|
static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry);
|
|
static llvm::Pass *CreateMakeInternalFuncsStaticPass();
|
|
|
|
///////////////////////////////////////////////////////////////////////////

/** This utility routine copies the metadata (if any) attached to the
    'from' instruction in the IR to the 'to' instruction.

    For flexibility, this function takes an llvm::Value rather than an
    llvm::Instruction for the 'to' parameter; in some places in the code
    below, a variable starts out holding an llvm::Value and only later
    holds an llvm::Instruction.  If an llvm::Value is passed to this
    routine, it just returns without doing anything; if it is in fact an
    llvm::Instruction, then the metadata can be copied to it.
*/
static void
lCopyMetadata(llvm::Value *vto, const llvm::Instruction *from) {
    llvm::Instruction *to = llvm::dyn_cast<llvm::Instruction>(vto);
    if (!to)
        return;

    llvm::SmallVector<std::pair<unsigned int, llvm::MDNode *>, 8> metadata;
    from->getAllMetadata(metadata);
    for (unsigned int i = 0; i < metadata.size(); ++i)
        to->setMetadata(metadata[i].first, metadata[i].second);
}
/** We have a protocol with the front-end LLVM IR code generation process
    that allows us to encode the source file position that corresponds with
    instructions.  (For example, this allows us to issue performance
    warnings related to things like scatter and gather after optimization
    has been performed, so that we aren't warning about scatters and
    gathers that have been improved to stores and loads by optimization
    passes.)  Note that this is slightly redundant with the source file
    position encoding generated for debugging symbols; we don't always
    generate debugging information, but we do always generate this
    position data.

    This function finds the SourcePos that the metadata in the instruction
    (if present) corresponds to.  See the implementation of
    FunctionEmitContext::addGSMetadata(), which encodes the source position
    during code generation.

    @param inst Instruction to try to find the source position of
    @param pos  Output variable in which to store the position
    @returns    True if source file position metadata was present and *pos
                has been set.  False otherwise.
*/
static bool
lGetSourcePosFromMetadata(const llvm::Instruction *inst, SourcePos *pos) {
    llvm::MDNode *filename = inst->getMetadata("filename");
    llvm::MDNode *line = inst->getMetadata("line");
    llvm::MDNode *column = inst->getMetadata("column");
    if (!filename || !line || !column)
        return false;

    // All of these asserts are things that FunctionEmitContext::addGSMetadata()
    // is expected to have done in its operation
    assert(filename->getNumOperands() == 1 && line->getNumOperands() == 1);
    llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(filename->getOperand(0));
    assert(str);
    llvm::ConstantInt *lnum = llvm::dyn_cast<llvm::ConstantInt>(line->getOperand(0));
    assert(lnum);
    llvm::ConstantInt *colnum = llvm::dyn_cast<llvm::ConstantInt>(column->getOperand(0));
    assert(colnum);

    *pos = SourcePos(str->getString().data(), (int)lnum->getZExtValue(),
                     (int)colnum->getZExtValue());
    return true;
}
/** Utility routine that prints out the LLVM IR for everything in the
    module.  (Used for debugging).
*/
static void
lPrintModuleCode(llvm::Module *module) {
    llvm::PassManager ppm;
    ppm.add(llvm::createPrintModulePass(&llvm::outs()));
    ppm.run(*module);
}
void
Optimize(llvm::Module *module, int optLevel) {
    if (g->debugPrint) {
        printf("*** Code going into optimization ***\n");
        lPrintModuleCode(module);
    }

    llvm::PassManager optPM;
    llvm::FunctionPassManager funcPM(module);

    llvm::TargetLibraryInfo *targetLibraryInfo =
        new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
    optPM.add(targetLibraryInfo);
    optPM.add(new llvm::TargetData(module));

#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
    optPM.add(llvm::createIndVarSimplifyPass());
#endif

    if (optLevel == 0) {
        // This is more or less the minimum set of optimizations that we
        // need to do to generate code that will actually run.  (We can't
        // run absolutely no optimizations, since the front-end needs us to
        // take the various __pseudo_* functions it has emitted and turn
        // them into something that can actually execute.)
        optPM.add(llvm::createPromoteMemoryToRegisterPass());
        optPM.add(CreateGatherScatterFlattenPass());
        if (g->opt.disableHandlePseudoMemoryOps == false) {
            optPM.add(CreateLowerGatherScatterPass());
            optPM.add(CreateLowerMaskedStorePass());
        }
        optPM.add(CreateIsCompileTimeConstantPass(true));
        optPM.add(llvm::createFunctionInliningPass());
        optPM.add(CreateMakeInternalFuncsStaticPass());
        optPM.add(llvm::createCFGSimplificationPass());
        optPM.add(llvm::createGlobalDCEPass());
    }
    else {
        // Otherwise throw the kitchen sink of optimizations at the code.
        // This is almost certainly overkill and likely could be reduced,
        // but on the other hand trying to remove some of these has
        // historically caused performance slowdowns.  Benchmark carefully
        // if changing these around.
        //
        // Note in particular that a number of the ispc optimization
        // passes are run repeatedly along the way; they often can kick in
        // only later in the optimization process as things like constant
        // propagation have done their thing, and then when they do kick
        // in, they can often open up new opportunities for optimization...
        llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
        llvm::initializeCore(*registry);
        llvm::initializeScalarOpts(*registry);
        llvm::initializeIPO(*registry);
        llvm::initializeAnalysis(*registry);
        llvm::initializeIPA(*registry);
        llvm::initializeTransformUtils(*registry);
        llvm::initializeInstCombine(*registry);
        llvm::initializeInstrumentation(*registry);
        llvm::initializeTarget(*registry);

        // Early optimizations to try to reduce the total amount of code to
        // work with if we can
        optPM.add(CreateGatherScatterFlattenPass());
        optPM.add(llvm::createReassociatePass());
        optPM.add(llvm::createConstantPropagationPass());

        if (!g->opt.disableMaskedStoreOptimizations) {
            optPM.add(CreateIntrinsicsOptPass());
            optPM.add(CreateMaskedStoreOptPass());
        }
        optPM.add(llvm::createDeadInstEliminationPass());

        optPM.add(llvm::createConstantPropagationPass());
        optPM.add(llvm::createDeadInstEliminationPass());

        // On to more serious optimizations
        optPM.add(llvm::createCFGSimplificationPass());
        optPM.add(llvm::createScalarReplAggregatesPass());
        optPM.add(llvm::createInstructionCombiningPass());
        optPM.add(llvm::createCFGSimplificationPass());
        optPM.add(llvm::createPromoteMemoryToRegisterPass());
        optPM.add(llvm::createGlobalOptimizerPass());
        optPM.add(llvm::createReassociatePass());
        optPM.add(llvm::createIPConstantPropagationPass());
        optPM.add(llvm::createDeadArgEliminationPass());
        optPM.add(llvm::createInstructionCombiningPass());
        optPM.add(llvm::createCFGSimplificationPass());
        optPM.add(llvm::createPruneEHPass());
        optPM.add(llvm::createFunctionAttrsPass());
        optPM.add(llvm::createFunctionInliningPass());
        optPM.add(llvm::createConstantPropagationPass());
        optPM.add(llvm::createDeadInstEliminationPass());
        optPM.add(llvm::createCFGSimplificationPass());

        optPM.add(llvm::createArgumentPromotionPass());
        optPM.add(llvm::createSimplifyLibCallsPass());
        optPM.add(llvm::createInstructionCombiningPass());
        optPM.add(llvm::createJumpThreadingPass());
        optPM.add(llvm::createCFGSimplificationPass());
        optPM.add(llvm::createScalarReplAggregatesPass());
        optPM.add(llvm::createInstructionCombiningPass());
        optPM.add(llvm::createTailCallEliminationPass());

        if (!g->opt.disableMaskedStoreOptimizations) {
            optPM.add(CreateIntrinsicsOptPass());
            optPM.add(CreateMaskedStoreOptPass());
        }
        optPM.add(CreateLowerMaskedStorePass());
        if (!g->opt.disableGatherScatterOptimizations)
            optPM.add(CreateGatherScatterImprovementsPass());
        if (g->opt.disableHandlePseudoMemoryOps == false) {
            optPM.add(CreateLowerMaskedStorePass());
            optPM.add(CreateLowerGatherScatterPass());
        }
        optPM.add(llvm::createFunctionInliningPass());
        optPM.add(llvm::createConstantPropagationPass());
        optPM.add(CreateIntrinsicsOptPass());

#if defined(LLVM_2_9)
        llvm::createStandardModulePasses(&optPM, 3,
                                         false /* opt size */,
                                         true /* unit at a time */,
                                         g->opt.unrollLoops,
                                         true /* simplify lib calls */,
                                         false /* may have exceptions */,
                                         llvm::createFunctionInliningPass());
        llvm::createStandardLTOPasses(&optPM, true /* internalize pass */,
                                      true /* inline once again */,
                                      false /* verify after each pass */);
        llvm::createStandardFunctionPasses(&optPM, 3);

        optPM.add(CreateIsCompileTimeConstantPass(true));
        optPM.add(CreateIntrinsicsOptPass());

        llvm::createStandardModulePasses(&optPM, 3,
                                         false /* opt size */,
                                         true /* unit at a time */,
                                         g->opt.unrollLoops,
                                         true /* simplify lib calls */,
                                         false /* may have exceptions */,
                                         llvm::createFunctionInliningPass());
#else
        llvm::PassManagerBuilder builder;
        builder.OptLevel = 3;
        builder.Inliner = llvm::createFunctionInliningPass();
        if (g->opt.unrollLoops == false)
            builder.DisableUnrollLoops = true;
        builder.populateFunctionPassManager(funcPM);
        builder.populateModulePassManager(optPM);
        optPM.add(CreateIsCompileTimeConstantPass(true));
        optPM.add(CreateIntrinsicsOptPass());
        builder.populateLTOPassManager(optPM, true /* internalize */,
                                       true /* inline once again */);
        optPM.add(CreateIsCompileTimeConstantPass(true));
        optPM.add(CreateIntrinsicsOptPass());
        builder.populateModulePassManager(optPM);
#endif
        optPM.add(CreateMakeInternalFuncsStaticPass());
        optPM.add(llvm::createGlobalDCEPass());
    }

    // Finish up by making sure we didn't mess anything up in the IR along
    // the way.
    optPM.add(llvm::createVerifierPass());

    for (llvm::Module::iterator fiter = module->begin(); fiter != module->end();
         ++fiter)
        funcPM.run(*fiter);

    optPM.run(*module);

    if (g->debugPrint) {
        printf("\n*****\nFINAL OUTPUT\n*****\n");
        lPrintModuleCode(module);
    }
}


///////////////////////////////////////////////////////////////////////////
// IntrinsicsOpt
/** This is a relatively simple optimization pass that does a few small
    optimizations that LLVM's x86 optimizer doesn't currently handle.
    (Specifically, MOVMSK of a constant can be replaced with the
    corresponding constant value, and BLENDVPS and AVX masked loads/stores
    with either an 'all on' or 'all off' mask can be replaced with simpler
    operations.)

    @todo The better thing to do would be to submit a patch to LLVM to get
    these; they're presumably pretty simple patterns to match.
*/
class IntrinsicsOpt : public llvm::BasicBlockPass {
|
|
public:
|
|
IntrinsicsOpt();
|
|
|
|
const char *getPassName() const { return "Intrinsics Cleanup Optimization"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
|
|
static char ID;
|
|
|
|
private:
|
|
struct MaskInstruction {
|
|
MaskInstruction(llvm::Function *f) { function = f; }
|
|
llvm::Function *function;
|
|
};
|
|
std::vector<MaskInstruction> maskInstructions;
|
|
|
|
/** Structure that records everything we need to know about a blend
|
|
instruction for this optimization pass.
|
|
*/
|
|
struct BlendInstruction {
|
|
BlendInstruction(llvm::Function *f, int ao, int o0, int o1, int of)
|
|
: function(f), allOnMask(ao), op0(o0), op1(o1), opFactor(of) { }
|
|
/** Function pointer for the blend instruction */
|
|
llvm::Function *function;
|
|
/** Mask value for an "all on" mask for this instruction */
|
|
int allOnMask;
|
|
/** The operand number in the llvm CallInst corresponds to the
|
|
first operand to blend with. */
|
|
int op0;
|
|
/** The operand number in the CallInst corresponding to the second
|
|
operand to blend with. */
|
|
int op1;
|
|
/** The operand in the call inst where the blending factor is
|
|
found. */
|
|
int opFactor;
|
|
};
|
|
std::vector<BlendInstruction> blendInstructions;
|
|
|
|
bool matchesMaskInstruction(llvm::Function *function);
|
|
BlendInstruction *matchingBlendInstruction(llvm::Function *function);
|
|
};
|
|
|
|
char IntrinsicsOpt::ID = 0;
|
|
llvm::RegisterPass<IntrinsicsOpt> sse("sse-constants", "Intrinsics Cleanup Pass");
|
|
|
|
|
|
IntrinsicsOpt::IntrinsicsOpt()
|
|
: BasicBlockPass(ID) {
|
|
|
|
// All of the mask instructions we may encounter. Note that even if
|
|
// compiling for AVX, we may still encounter the regular 4-wide SSE
|
|
// MOVMSK instruction.
|
|
llvm::Function *sseMovmsk =
|
|
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps);
|
|
maskInstructions.push_back(sseMovmsk);
|
|
maskInstructions.push_back(m->module->getFunction("__movmsk"));
|
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
|
llvm::Function *avxMovmsk =
|
|
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_movmsk_ps_256);
|
|
assert(avxMovmsk != NULL);
|
|
maskInstructions.push_back(avxMovmsk);
|
|
#endif
|
|
|
|
// And all of the blend instructions
|
|
blendInstructions.push_back(BlendInstruction(
|
|
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse41_blendvps),
|
|
0xf, 0, 1, 2));
|
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
|
blendInstructions.push_back(BlendInstruction(
|
|
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_blendv_ps_256),
|
|
0xff, 0, 1, 2));
|
|
#endif
|
|
}
|
|
|
|
|
|
/** Given an llvm::Value representing a vector mask, see if the value is a
    constant.  If so, return the integer mask found by taking the high bits
    of the mask values in turn and concatenating them into a single integer.
    In other words, given the 4-wide mask: < 0xffffffff, 0, 0, 0xffffffff >,
    we have 0b1001 = 9.

    @todo This will break if we ever do 32-wide compilation, in which case
    it won't be possible to distinguish between -1 for "don't know" and
    "known and all bits on".
*/
static int
|
|
lGetMask(llvm::Value *factor) {
|
|
llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(factor);
|
|
if (cv) {
|
|
int mask = 0;
|
|
llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
|
|
cv->getVectorElements(elements);
|
|
|
|
for (unsigned int i = 0; i < elements.size(); ++i) {
|
|
llvm::APInt intMaskValue;
|
|
// SSE has the "interesting" approach of encoding blending
|
|
// masks as <n x float>.
|
|
llvm::ConstantFP *cf = llvm::dyn_cast<llvm::ConstantFP>(elements[i]);
|
|
if (cf) {
|
|
llvm::APFloat apf = cf->getValueAPF();
|
|
intMaskValue = apf.bitcastToAPInt();
|
|
}
|
|
else {
|
|
// Otherwise get it as an int
|
|
llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i]);
|
|
assert(ci != NULL); // vs return -1 if NULL?
|
|
intMaskValue = ci->getValue();
|
|
}
|
|
// Is the high-bit set? If so, OR in the appropriate bit in
|
|
// the result mask
|
|
if (intMaskValue.countLeadingOnes() > 0)
|
|
mask |= (1 << i);
|
|
}
|
|
return mask;
|
|
}
|
|
else if (llvm::isa<llvm::ConstantAggregateZero>(factor))
|
|
return 0;
|
|
else {
|
|
#if 0
|
|
llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
|
|
if (ce != NULL) {
|
|
llvm::TargetMachine *targetMachine = g->target.GetTargetMachine();
|
|
const llvm::TargetData *td = targetMachine->getTargetData();
|
|
llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
|
|
c->dump();
|
|
factor = c;
|
|
}
|
|
// else we should be able to handle it above...
|
|
assert(!llvm::isa<llvm::Constant>(factor));
|
|
#endif
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
|
|
/** Given an llvm::Value, return true if we can determine that it's an
|
|
undefined value. This only makes a weak attempt at chasing this down,
|
|
only detecting flat-out undef values, and bitcasts of undef values.
|
|
|
|
@todo Is it worth working harder to find more of these? It starts to
|
|
get tricky, since having an undef operand doesn't necessarily mean that
|
|
the result will be undefined. (And for that matter, is there an LLVM
|
|
call that will do this for us?)
|
|
*/
|
|
static bool
|
|
lIsUndef(llvm::Value *value) {
|
|
if (llvm::isa<llvm::UndefValue>(value))
|
|
return true;
|
|
|
|
llvm::BitCastInst *bci = llvm::dyn_cast<llvm::BitCastInst>(value);
|
|
if (bci)
|
|
return lIsUndef(bci->getOperand(0));
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
bool
|
|
IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
|
llvm::Function *avxMaskedLoad32 =
|
|
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_maskload_ps_256);
|
|
llvm::Function *avxMaskedLoad64 =
|
|
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_maskload_pd_256);
|
|
llvm::Function *avxMaskedStore32 =
|
|
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_maskstore_ps_256);
|
|
llvm::Function *avxMaskedStore64 =
|
|
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_avx_maskstore_pd_256);
|
|
assert(avxMaskedLoad32 != NULL && avxMaskedStore32 != NULL);
|
|
assert(avxMaskedLoad64 != NULL && avxMaskedStore64 != NULL);
|
|
#endif
|
|
|
|
bool modifiedAny = false;
|
|
restart:
|
|
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
|
|
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
|
|
if (!callInst)
|
|
continue;
|
|
|
|
BlendInstruction *blend = matchingBlendInstruction(callInst->getCalledFunction());
|
|
if (blend != NULL) {
|
|
llvm::Value *v[2] = { callInst->getArgOperand(blend->op0),
|
|
callInst->getArgOperand(blend->op1) };
|
|
llvm::Value *factor = callInst->getArgOperand(blend->opFactor);
|
|
|
|
// If the values are the same, then no need to blend..
|
|
if (v[0] == v[1]) {
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
|
|
iter, v[0]);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
|
|
// If one of the two is undefined, we're allowed to replace
|
|
// with the value of the other. (In other words, the only
|
|
// valid case is that the blend factor ends up having a value
|
|
// that only selects from the defined one of the two operands,
|
|
// otherwise the result is undefined and any value is fine,
|
|
// ergo the defined one is an acceptable result.)
|
|
if (lIsUndef(v[0])) {
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
|
|
iter, v[1]);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
if (lIsUndef(v[1])) {
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
|
|
iter, v[0]);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
|
|
int mask = lGetMask(factor);
|
|
llvm::Value *value = NULL;
|
|
if (mask == 0)
|
|
// Mask all off -> replace with the first blend value
|
|
value = v[0];
|
|
else if (mask == blend->allOnMask)
|
|
// Mask all on -> replace with the second blend value
|
|
value = v[1];
|
|
|
|
if (value != NULL) {
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
|
|
iter, value);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
else if (matchesMaskInstruction(callInst->getCalledFunction())) {
|
|
llvm::Value *factor = callInst->getArgOperand(0);
|
|
int mask = lGetMask(factor);
|
|
if (mask != -1) {
|
|
// If the vector-valued mask has a known value, replace it
|
|
// with the corresponding integer mask from its elements
|
|
// high bits.
|
|
llvm::Value *value = LLVMInt32(mask);
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
|
|
iter, value);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
|
else if (callInst->getCalledFunction() == avxMaskedLoad32 ||
|
|
callInst->getCalledFunction() == avxMaskedLoad64) {
|
|
llvm::Value *factor = callInst->getArgOperand(1);
|
|
int mask = lGetMask(factor);
|
|
if (mask == 0) {
|
|
// nothing being loaded, replace with undef value
|
|
llvm::Type *returnType = callInst->getType();
|
|
assert(llvm::isa<llvm::VectorType>(returnType));
|
|
llvm::Value *undefValue = llvm::UndefValue::get(returnType);
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
|
|
iter, undefValue);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
else if (mask == 0xff) {
|
|
// all lanes active; replace with a regular load
|
|
llvm::Type *returnType = callInst->getType();
|
|
assert(llvm::isa<llvm::VectorType>(returnType));
|
|
// cast the i8 * to the appropriate type
|
|
llvm::Value *castPtr =
|
|
new llvm::BitCastInst(callInst->getArgOperand(0),
|
|
llvm::PointerType::get(returnType, 0),
|
|
"ptr2vec", callInst);
|
|
lCopyMetadata(castPtr, callInst);
|
|
int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
|
|
llvm::Instruction *loadInst =
|
|
new llvm::LoadInst(castPtr, "load", false /* not volatile */,
|
|
align, (llvm::Instruction *)NULL);
|
|
lCopyMetadata(loadInst, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, loadInst);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
else if (callInst->getCalledFunction() == avxMaskedStore32 ||
|
|
callInst->getCalledFunction() == avxMaskedStore64) {
|
|
// NOTE: mask is the 2nd parameter, not the 3rd one!!
|
|
llvm::Value *factor = callInst->getArgOperand(1);
|
|
int mask = lGetMask(factor);
|
|
if (mask == 0) {
|
|
// nothing actually being stored, just remove the inst
|
|
callInst->eraseFromParent();
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
else if (mask == 0xff) {
|
|
// all lanes storing, so replace with a regular store
|
|
llvm::Value *rvalue = callInst->getArgOperand(2);
|
|
llvm::Type *storeType = rvalue->getType();
|
|
llvm::Value *castPtr =
|
|
new llvm::BitCastInst(callInst->getArgOperand(0),
|
|
llvm::PointerType::get(storeType, 0),
|
|
"ptr2vec", callInst);
|
|
lCopyMetadata(castPtr, callInst);
|
|
|
|
llvm::StoreInst *storeInst =
|
|
new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
|
|
int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
|
|
storeInst->setAlignment(align);
|
|
lCopyMetadata(storeInst, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, storeInst);
|
|
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
|
|
bool
|
|
IntrinsicsOpt::matchesMaskInstruction(llvm::Function *function) {
|
|
for (unsigned int i = 0; i < maskInstructions.size(); ++i)
|
|
if (maskInstructions[i].function != NULL &&
|
|
function == maskInstructions[i].function)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
|
|
IntrinsicsOpt::BlendInstruction *
|
|
IntrinsicsOpt::matchingBlendInstruction(llvm::Function *function) {
|
|
for (unsigned int i = 0; i < blendInstructions.size(); ++i)
|
|
if (blendInstructions[i].function != NULL &&
|
|
function == blendInstructions[i].function)
|
|
return &blendInstructions[i];
|
|
return NULL;
|
|
}
|
|
|
|
|
|
static llvm::Pass *
|
|
CreateIntrinsicsOptPass() {
|
|
return new IntrinsicsOpt;
|
|
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// GatherScatterFlattenOpt
|
|
|
|
/** When the front-end emits gathers and scatters, it generates an array of
|
|
vector-width pointers to represent the set of addresses to read from or
|
|
write to. However, because ispc doesn't support pointers, it turns
|
|
out to be the case that scatters and gathers always end up indexing
|
|
into an array with a common base pointer. Therefore, this optimization
|
|
transforms the original arrays of general pointers into a single base
|
|
pointer and an array of offsets.
|
|
|
|
(Implementation seems to be easier with this approach versus having the
|
|
front-end try to emit base pointer + offset stuff from the start,
|
|
though arguably the latter approach would be a little more elegant.)
|
|
|
|
See for example the comments discussing the __pseudo_gather functions
|
|
in builtins.cpp for more information about this.
|
|
|
|
@todo The implementation of this is pretty messy, and it sure would be
|
|
nice to not have all the complexity of built-in assumptions of the
|
|
structure of how the front end will have generated code, all of the
|
|
instruction dyn_casts, etc. Can we do something simpler, e.g. an early
|
|
pass to flatten out GEPs when the size is known, then do LLVM's
|
|
constant folding, then flatten into an array, etc.?
|
|
*/
|
|
class GatherScatterFlattenOpt : public llvm::BasicBlockPass {
|
|
public:
|
|
static char ID;
|
|
GatherScatterFlattenOpt() : BasicBlockPass(ID) { }
|
|
|
|
const char *getPassName() const { return "Gather/Scatter Flattening"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
};
|
|
|
|
char GatherScatterFlattenOpt::ID = 0;
|
|
|
|
llvm::RegisterPass<GatherScatterFlattenOpt> gsf("gs-flatten", "Gather/Scatter Flatten Pass");
|
|
|
|
|
|
/** Given an llvm::Value known to be an unsigned integer, return its value
    as a uint64_t.
*/
static uint64_t
|
|
lGetIntValue(llvm::Value *offset) {
|
|
llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
|
|
assert(intOffset && (intOffset->getBitWidth() == 32 ||
|
|
intOffset->getBitWidth() == 64));
|
|
return intOffset->getZExtValue();
|
|
}
|
|
|
|
|
|
static llvm::Value *
|
|
lGetTypeSize(LLVM_TYPE_CONST llvm::Type *type, llvm::Instruction *insertBefore) {
|
|
LLVM_TYPE_CONST llvm::ArrayType *arrayType =
|
|
llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type);
|
|
if (arrayType != NULL)
|
|
type = arrayType->getElementType();
|
|
|
|
llvm::Value *scale = g->target.SizeOf(type);
|
|
if (g->target.is32bit == false) {
|
|
scale = new llvm::TruncInst(scale, LLVMTypes::Int32Type, "sizeof32",
|
|
insertBefore);
|
|
lCopyMetadata(scale, insertBefore);
|
|
}
|
|
return scale;
|
|
}
|
|
|
|
|
|
static llvm::Value *
|
|
lTraverseConstantExpr(llvm::Constant *value, llvm::Value **offsetPtr,
|
|
LLVM_TYPE_CONST llvm::Type **scaleType,
|
|
llvm::Instruction *insertBefore) {
|
|
llvm::GlobalVariable *gv = NULL;
|
|
llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(value);
|
|
if (ce != NULL) {
|
|
switch (ce->getOpcode()) {
|
|
case llvm::Instruction::BitCast:
|
|
*offsetPtr = LLVMInt32(0);
|
|
return lTraverseConstantExpr(ce->getOperand(0), offsetPtr,
|
|
scaleType, insertBefore);
|
|
case llvm::Instruction::GetElementPtr: {
|
|
gv = llvm::dyn_cast<llvm::GlobalVariable>(ce->getOperand(0));
|
|
assert(gv != NULL);
|
|
|
|
assert(lGetIntValue(ce->getOperand(1)) == 0);
|
|
LLVM_TYPE_CONST llvm::PointerType *targetPtrType =
|
|
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(ce->getOperand(0)->getType());
|
|
assert(targetPtrType);
|
|
LLVM_TYPE_CONST llvm::Type *targetType = targetPtrType->getElementType();
|
|
|
|
if (llvm::isa<const llvm::StructType>(targetType)) {
|
|
*offsetPtr = g->target.StructOffset(targetType,
|
|
lGetIntValue(ce->getOperand(2)));
|
|
*scaleType = LLVMTypes::Int8Type; // aka char aka sizeof(1)
|
|
}
|
|
else {
|
|
*offsetPtr = ce->getOperand(2);
|
|
assert(*scaleType == NULL || *scaleType == targetType);
|
|
*scaleType = targetType;
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
FATAL("Unexpected opcode in constant expression!");
|
|
//printf("other op %s\n", ce->getOpcodeName());
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (gv == NULL)
|
|
gv = llvm::dyn_cast<llvm::GlobalVariable>(value);
|
|
|
|
return gv;
|
|
}
|
|
|
|
|
|
static llvm::Value *
|
|
lGetOffsetForLane(int lane, llvm::Value *value, llvm::Value **offset,
|
|
LLVM_TYPE_CONST llvm::Type **scaleType,
|
|
llvm::Instruction *insertBefore) {
|
|
if (!llvm::isa<llvm::GetElementPtrInst>(value)) {
|
|
assert(llvm::isa<llvm::BitCastInst>(value));
|
|
value = llvm::dyn_cast<llvm::BitCastInst>(value)->getOperand(0);
|
|
|
|
llvm::ExtractValueInst *ev = llvm::dyn_cast<llvm::ExtractValueInst>(value);
|
|
assert(ev->hasIndices() && ev->getNumIndices() == 1);
|
|
assert(int(*(ev->idx_begin())) == lane);
|
|
|
|
llvm::InsertValueInst *iv = llvm::dyn_cast<llvm::InsertValueInst>(ev->getOperand(0));
|
|
assert(iv->hasIndices() && iv->getNumIndices() == 1);
|
|
while (int(*(iv->idx_begin())) != lane) {
|
|
iv = llvm::dyn_cast<llvm::InsertValueInst>(iv->getOperand(0));
|
|
assert(iv && iv->hasIndices() && iv->getNumIndices() == 1);
|
|
}
|
|
|
|
value = iv->getOperand(1);
|
|
}
|
|
|
|
llvm::GetElementPtrInst *gep = llvm::dyn_cast<llvm::GetElementPtrInst>(value);
|
|
assert(gep);
|
|
|
|
assert(lGetIntValue(gep->getOperand(1)) == 0);
|
|
LLVM_TYPE_CONST llvm::PointerType *targetPtrType =
|
|
llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(gep->getOperand(0)->getType());
|
|
assert(targetPtrType);
|
|
LLVM_TYPE_CONST llvm::Type *targetType = targetPtrType->getElementType();
|
|
|
|
if (llvm::isa<const llvm::StructType>(targetType)) {
|
|
*offset = g->target.StructOffset(targetType, lGetIntValue(gep->getOperand(2)));
|
|
lCopyMetadata(*offset, insertBefore);
|
|
*scaleType = LLVMTypes::Int8Type; // aka char aka sizeof(1)
|
|
}
|
|
else {
|
|
*offset = gep->getOperand(2);
|
|
assert(*scaleType == NULL || *scaleType == targetType);
|
|
*scaleType = targetType;
|
|
}
|
|
|
|
llvm::ExtractValueInst *ee =
|
|
llvm::dyn_cast<llvm::ExtractValueInst>(gep->getOperand(0));
|
|
if (ee == NULL) {
|
|
// found the base pointer, here it is...
|
|
return gep->getOperand(0);
|
|
}
|
|
else {
|
|
assert(ee->hasIndices() && ee->getNumIndices() == 1 &&
|
|
int(*(ee->idx_begin())) == lane);
|
|
llvm::InsertValueInst *iv =
|
|
llvm::dyn_cast<llvm::InsertValueInst>(ee->getOperand(0));
|
|
assert(iv != NULL);
|
|
// do this chain of inserts for the next dimension...
|
|
return iv;
|
|
}
|
|
}
|
|
|
|
|
|
/** We have an LLVM array of pointer values, where each pointer has been
    computed with a GEP from some common base pointer value.  This function
    deconstructs the LLVM array, storing the offset from the base pointer
    as an llvm::Value for the i'th element into the i'th element of the
    offsets[] array passed in to the function.  It returns a scale factor
    for the offsets via *scaleType.  The return value is either the base
    pointer or an array of pointers for the next dimension of indexing
    (that we'll in turn deconstruct with this function).
*/
static llvm::Value *
|
|
lTraverseInsertChain(llvm::Value *ptrs, llvm::Value *offsets[ISPC_MAX_NVEC],
|
|
LLVM_TYPE_CONST llvm::Type **scaleType,
|
|
llvm::Instruction *insertBefore) {
|
|
// The pointer values may be an array of constant pointers (this
|
|
// happens, for example, when indexing into global arrays.) In that
|
|
// case, we have llvm::ConstantExprs to deconstruct to dig out the
|
|
// common base pointer and the per-lane offsets.
|
|
llvm::ConstantArray *ca = llvm::dyn_cast<llvm::ConstantArray>(ptrs);
|
|
if (ca != NULL) {
|
|
assert((int)ca->getNumOperands() == g->target.vectorWidth);
|
|
llvm::Value *base = NULL;
|
|
for (int i = 0; i < g->target.vectorWidth; ++i) {
|
|
llvm::Value *b = lTraverseConstantExpr(ca->getOperand(i), &offsets[i],
|
|
scaleType, insertBefore);
|
|
if (i == 0)
|
|
base = b;
|
|
else
|
|
assert(base == b);
|
|
}
|
|
return base;
|
|
}
|
|
|
|
// This depends on the front-end constructing the arrays of pointers
|
|
// via InsertValue instructions. (Which it does do in
|
|
// FunctionEmitContext::GetElementPtrInst()).
|
|
llvm::InsertValueInst *ivInst = llvm::dyn_cast<llvm::InsertValueInst>(ptrs);
|
|
assert(ivInst != NULL);
|
|
|
|
    // We have a chain of insert value instructions where each instruction
    // sets one of the elements of the array and where the input array is
    // either the base pointer or another insert value instruction.  Here
    // we walk through all of the insert value instructions until we hit
    // the end.
llvm::Value *nextChain = NULL;
|
|
while (ivInst != NULL) {
|
|
// Figure out which array index this current instruction is setting
|
|
// the value of.
|
|
assert(ivInst->hasIndices() && ivInst->getNumIndices() == 1);
|
|
int elementIndex = *(ivInst->idx_begin());
|
|
assert(elementIndex >= 0 && elementIndex < g->target.vectorWidth);
|
|
// We shouldn't have already seen something setting the value for
|
|
// this index.
|
|
assert(offsets[elementIndex] == NULL);
|
|
|
|
// Set offsets[elementIndex] here. This returns the value from
|
|
// which the GEP operation was done; this should either be the base
|
|
// pointer or an insert value chain for another dimension of the
|
|
// array being indexed into.
|
|
llvm::Value *myNext = lGetOffsetForLane(elementIndex, ivInst->getOperand(1),
|
|
&offsets[elementIndex], scaleType,
|
|
insertBefore);
|
|
if (nextChain == NULL)
|
|
nextChain = myNext;
|
|
else
|
|
// All of these insert value instructions should have the same
|
|
// base value
|
|
assert(nextChain == myNext);
|
|
|
|
// Do we have another element of the array to process?
|
|
llvm::Value *nextInsert = ivInst->getOperand(0);
|
|
ivInst = llvm::dyn_cast<llvm::InsertValueInst>(nextInsert);
|
|
if (!ivInst)
|
|
assert(llvm::isa<llvm::UndefValue>(nextInsert));
|
|
}
|
|
return nextChain;
|
|
}
|
|
|
|
|
|
/** Given a scalar value, return a vector of width g->target.vectorWidth
|
|
that has the scalar replicated across each of its elements.
|
|
|
|
@todo Using shufflevector to do this seems more idiomatic (and would be
|
|
just a single instruction). Switch to that?
|
|
*/
|
|
static llvm::Value *
|
|
lSmearScalar(llvm::Value *scalar, llvm::Instruction *insertBefore) {
|
|
LLVM_TYPE_CONST llvm::Type *vectorType = llvm::VectorType::get(scalar->getType(),
|
|
g->target.vectorWidth);
|
|
llvm::Value *result = llvm::UndefValue::get(vectorType);
|
|
for (int i = 0; i < g->target.vectorWidth; ++i) {
|
|
result = llvm::InsertElementInst::Create(result, scalar, LLVMInt32(i),
|
|
"smearinsert", insertBefore);
|
|
lCopyMetadata(result, insertBefore);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
|
|
static llvm::Value *
|
|
lGetPtrAndOffsets(llvm::Value *ptrs, llvm::Value **basePtr,
|
|
llvm::Instruction *insertBefore, int eltSize) {
|
|
llvm::Value *offset = LLVMInt32Vector(0);
|
|
|
|
while (ptrs != NULL) {
|
|
llvm::Value *offsets[ISPC_MAX_NVEC];
|
|
for (int i = 0; i < g->target.vectorWidth; ++i)
|
|
offsets[i] = NULL;
|
|
LLVM_TYPE_CONST llvm::Type *scaleType = NULL;
|
|
|
|
llvm::Value *nextChain =
|
|
lTraverseInsertChain(ptrs, offsets, &scaleType, insertBefore);
|
|
|
|
for (int i = 0; i < g->target.vectorWidth; ++i)
|
|
assert(offsets[i] != NULL);
|
|
llvm::Value *delta = llvm::UndefValue::get(LLVMTypes::Int32VectorType);
|
|
for (int i = 0; i < g->target.vectorWidth; ++i) {
|
|
delta = llvm::InsertElementInst::Create(delta, offsets[i],
|
|
LLVMInt32(i), "dim",
|
|
insertBefore);
|
|
lCopyMetadata(delta, insertBefore);
|
|
}
|
|
|
|
llvm::Value *size = lGetTypeSize(scaleType, insertBefore);
|
|
|
|
llvm::Value *scale = lSmearScalar(size, insertBefore);
|
|
delta = llvm::BinaryOperator::Create(llvm::Instruction::Mul, delta,
|
|
scale, "delta_scale", insertBefore);
|
|
lCopyMetadata(delta, insertBefore);
|
|
offset = llvm::BinaryOperator::Create(llvm::Instruction::Add, offset,
|
|
delta, "offset_delta",
|
|
insertBefore);
|
|
lCopyMetadata(offset, insertBefore);
|
|
|
|
if (llvm::dyn_cast<llvm::InsertValueInst>(nextChain))
|
|
ptrs = nextChain;
|
|
else {
|
|
// else we don't have a unique starting pointer....
|
|
assert(*basePtr == NULL || *basePtr == nextChain);
|
|
*basePtr = nextChain;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return offset;
|
|
}
|
|
|
|
|
|
struct GSInfo {
|
|
GSInfo(const char *pgFuncName, const char *pgboFuncName, bool ig, int es)
|
|
: isGather(ig), elementSize(es) {
|
|
func = m->module->getFunction(pgFuncName);
|
|
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
|
|
}
|
|
llvm::Function *func;
|
|
llvm::Function *baseOffsetsFunc;
|
|
const bool isGather;
|
|
const int elementSize;
|
|
};
|
|
|
|
|
|
bool
|
|
GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
GSInfo gsFuncs[] = {
|
|
GSInfo("__pseudo_gather_8", "__pseudo_gather_base_offsets_8", true, 1),
|
|
GSInfo("__pseudo_gather_16", "__pseudo_gather_base_offsets_16", true, 2),
|
|
GSInfo("__pseudo_gather_32", "__pseudo_gather_base_offsets_32", true, 4),
|
|
GSInfo("__pseudo_gather_64", "__pseudo_gather_base_offsets_64", true, 8),
|
|
GSInfo("__pseudo_scatter_8", "__pseudo_scatter_base_offsets_8", false, 1),
|
|
GSInfo("__pseudo_scatter_16", "__pseudo_scatter_base_offsets_16", false, 2),
|
|
GSInfo("__pseudo_scatter_32", "__pseudo_scatter_base_offsets_32", false, 4),
|
|
GSInfo("__pseudo_scatter_64", "__pseudo_scatter_base_offsets_64", false, 8),
|
|
};
|
|
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
|
|
for (int i = 0; i < numGSFuncs; ++i)
|
|
assert(gsFuncs[i].func != NULL && gsFuncs[i].baseOffsetsFunc != NULL);
|
|
|
|
bool modifiedAny = false;
|
|
restart:
|
|
// Iterate through all of the instructions in the basic block.
|
|
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
|
|
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
|
|
// If we don't have a call to one of the
|
|
// __pseudo_{gather,scatter}_* functions, then just go on to the
|
|
// next instruction.
|
|
if (callInst == NULL)
|
|
continue;
|
|
GSInfo *info = NULL;
|
|
for (int i = 0; i < numGSFuncs; ++i)
|
|
if (gsFuncs[i].func != NULL &&
|
|
callInst->getCalledFunction() == gsFuncs[i].func) {
|
|
info = &gsFuncs[i];
|
|
break;
|
|
}
|
|
if (info == NULL)
|
|
continue;
|
|
|
|
// Transform the array of pointers to a single base pointer and an
|
|
// array of int32 offsets. (All the hard work is done by
|
|
// lGetPtrAndOffsets).
|
|
llvm::Value *ptrs = callInst->getArgOperand(0);
|
|
llvm::Value *basePtr = NULL;
|
|
llvm::Value *offsetVector = lGetPtrAndOffsets(ptrs, &basePtr, callInst,
|
|
info->elementSize);
|
|
// Cast the base pointer to a void *, since that's what the
|
|
// __pseudo_*_base_offsets_* functions want.
|
|
basePtr = new llvm::BitCastInst(basePtr, LLVMTypes::VoidPointerType,
|
|
"base2void", callInst);
|
|
lCopyMetadata(basePtr, callInst);
|
|
|
|
if (info->isGather) {
|
|
llvm::Value *mask = callInst->getArgOperand(1);
|
|
|
|
// Generate a new function call to the next pseudo gather
|
|
// base+offsets instruction. Note that we're passing a NULL
|
|
// llvm::Instruction to llvm::CallInst::Create; this means that
|
|
// the instruction isn't inserted into a basic block and that
|
|
// way we can then call ReplaceInstWithInst().
|
|
llvm::Value *newArgs[3] = { basePtr, offsetVector, mask };
|
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
|
llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[3]);
|
|
llvm::Instruction *newCall =
|
|
llvm::CallInst::Create(info->baseOffsetsFunc, newArgArray,
|
|
"newgather", (llvm::Instruction *)NULL);
|
|
#else
|
|
llvm::Instruction *newCall =
|
|
llvm::CallInst::Create(info->baseOffsetsFunc, &newArgs[0], &newArgs[3],
|
|
"newgather");
|
|
#endif
|
|
lCopyMetadata(newCall, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, newCall);
|
|
}
|
|
else {
|
|
llvm::Value *mask = callInst->getArgOperand(2);
|
|
llvm::Value *rvalue = callInst->getArgOperand(1);
|
|
|
|
// Generate a new function call to the next pseudo scatter
|
|
// base+offsets instruction. See above for why passing NULL
|
|
// for the Instruction * is intended.
|
|
llvm::Value *newArgs[4] = { basePtr, offsetVector, rvalue, mask };
|
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
|
llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[4]);
|
|
llvm::Instruction *newCall =
|
|
llvm::CallInst::Create(info->baseOffsetsFunc, newArgArray, "",
|
|
(llvm::Instruction *)NULL);
|
|
#else
|
|
llvm::Instruction *newCall =
|
|
llvm::CallInst::Create(info->baseOffsetsFunc, &newArgs[0],
|
|
&newArgs[4]);
|
|
#endif
|
|
lCopyMetadata(newCall, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, newCall);
|
|
}
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
|
|
static llvm::Pass *
|
|
CreateGatherScatterFlattenPass() {
|
|
return new GatherScatterFlattenOpt;
|
|
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// MaskedStoreOptPass
|
|
|
|
/** Masked stores are generally more complex than regular stores; for
|
|
example, they require multiple instructions to simulate under SSE.
|
|
This optimization detects cases where masked stores can be replaced
|
|
with regular stores or removed entirely, for the cases of an 'all on'
|
|
mask and an 'all off' mask, respectively.
|
|
*/
|
|
class MaskedStoreOptPass : public llvm::BasicBlockPass {
|
|
public:
|
|
static char ID;
|
|
MaskedStoreOptPass() : BasicBlockPass(ID) { }
|
|
|
|
const char *getPassName() const { return "Masked Store Scalarize"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
};
|
|
|
|
|
|
char MaskedStoreOptPass::ID = 0;
|
|
|
|
llvm::RegisterPass<MaskedStoreOptPass> mss("masked-store-scalarize",
|
|
"Masked Store Scalarize Pass");
|
|
|
|
struct MSInfo {
|
|
MSInfo(const char *name, const int a)
|
|
: align(a) {
|
|
func = m->module->getFunction(name);
|
|
assert(func != NULL);
|
|
}
|
|
llvm::Function *func;
|
|
const int align;
|
|
};
|
|
|
|
|
|
bool
|
|
MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
MSInfo msInfo[] = {
|
|
MSInfo("__pseudo_masked_store_8", 1),
|
|
MSInfo("__pseudo_masked_store_16", 2),
|
|
MSInfo("__pseudo_masked_store_32", 4),
|
|
MSInfo("__pseudo_masked_store_64", 8),
|
|
MSInfo("__masked_store_blend_8", 1),
|
|
MSInfo("__masked_store_blend_16", 2),
|
|
MSInfo("__masked_store_blend_32", 4),
|
|
MSInfo("__masked_store_blend_64", 8),
|
|
MSInfo("__masked_store_8", 1),
|
|
MSInfo("__masked_store_16", 2),
|
|
MSInfo("__masked_store_32", 4),
|
|
MSInfo("__masked_store_64", 8)
|
|
};
|
|
|
|
bool modifiedAny = false;
|
|
restart:
|
|
// Iterate over all of the instructions to look for one of the various
|
|
// masked store functions
|
|
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
|
|
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
|
|
if (!callInst)
|
|
continue;
|
|
|
|
llvm::Function *called = callInst->getCalledFunction();
|
|
int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
|
|
MSInfo *info = NULL;
|
|
for (int i = 0; i < nMSFuncs; ++i) {
|
|
if (msInfo[i].func != NULL && called == msInfo[i].func) {
|
|
info = &msInfo[i];
|
|
break;
|
|
}
|
|
}
|
|
if (info == NULL)
|
|
continue;
|
|
|
|
// Got one; grab the operands
|
|
llvm::Value *lvalue = callInst->getArgOperand(0);
|
|
llvm::Value *rvalue = callInst->getArgOperand(1);
|
|
llvm::Value *mask = callInst->getArgOperand(2);
|
|
|
|
int allOnMask = (1 << g->target.vectorWidth) - 1;
|
|
|
|
int maskAsInt = lGetMask(mask);
|
|
if (maskAsInt == 0) {
|
|
// Zero mask - no-op, so remove the store completely. (This
|
|
// may in turn lead to being able to optimize out instructions
|
|
// that compute the rvalue...)
|
|
callInst->eraseFromParent();
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
else if (maskAsInt == allOnMask) {
|
|
// The mask is all on, so turn this into a regular store
|
|
LLVM_TYPE_CONST llvm::Type *rvalueType = rvalue->getType();
|
|
LLVM_TYPE_CONST llvm::Type *ptrType =
|
|
llvm::PointerType::get(rvalueType, 0);
|
|
|
|
lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
|
|
lCopyMetadata(lvalue, callInst);
|
|
llvm::Instruction *store =
|
|
new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
|
|
info->align);
|
|
lCopyMetadata(store, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, store);
|
|
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
|
|
static llvm::Pass *
|
|
CreateMaskedStoreOptPass() {
|
|
return new MaskedStoreOptPass;
|
|
}
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// LowerMaskedStorePass
|
|
|
|
/** When the front-end needs to do a masked store, it emits a
|
|
__pseudo_masked_store_{8,16,32,64} call as a placeholder. This pass
|
|
lowers these calls to either __masked_store_{8,16,32,64} or
|
|
__masked_store_blend_{8,16,32,64} calls.
|
|
*/
|
|
class LowerMaskedStorePass : public llvm::BasicBlockPass {
|
|
public:
|
|
static char ID;
|
|
LowerMaskedStorePass() : BasicBlockPass(ID) { }
|
|
|
|
const char *getPassName() const { return "Lower Masked Stores"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
};
|
|
|
|
|
|
char LowerMaskedStorePass::ID = 0;
|
|
|
|
llvm::RegisterPass<LowerMaskedStorePass> lms("masked-store-lower",
|
|
"Lower Masked Store Pass");
|
|
|
|
|
|
/** This routine attempts to determine if the given pointer in lvalue is
|
|
pointing to stack-allocated memory. It's conservative in that it
|
|
should never return true for non-stack allocated memory, but may return
|
|
false for memory that actually is stack allocated. The basic strategy
|
|
is to traverse through the operands and see if the pointer originally
|
|
comes from an AllocaInst.
|
|
*/
|
|
static bool
|
|
lIsStackVariablePointer(llvm::Value *lvalue) {
|
|
llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(lvalue);
|
|
if (bc)
|
|
return lIsStackVariablePointer(bc->getOperand(0));
|
|
else {
|
|
llvm::AllocaInst *ai = llvm::dyn_cast<llvm::AllocaInst>(lvalue);
|
|
if (ai)
|
|
return true;
|
|
else {
|
|
llvm::GetElementPtrInst *gep = llvm::dyn_cast<llvm::GetElementPtrInst>(lvalue);
|
|
if (gep)
|
|
return lIsStackVariablePointer(gep->getOperand(0));
|
|
else
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
struct LMSInfo {
|
|
LMSInfo(const char *pname, const char *bname, const char *msname) {
|
|
pseudoFunc = m->module->getFunction(pname);
|
|
blendFunc = m->module->getFunction(bname);
|
|
maskedStoreFunc = m->module->getFunction(msname);
|
|
assert(pseudoFunc != NULL && blendFunc != NULL &&
|
|
maskedStoreFunc != NULL);
|
|
}
|
|
llvm::Function *pseudoFunc;
|
|
llvm::Function *blendFunc;
|
|
llvm::Function *maskedStoreFunc;
|
|
};
|
|
|
|
|
|
bool
|
|
LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
LMSInfo msInfo[] = {
|
|
LMSInfo("__pseudo_masked_store_8", "__masked_store_blend_8",
|
|
"__masked_store_8"),
|
|
LMSInfo("__pseudo_masked_store_16", "__masked_store_blend_16",
|
|
"__masked_store_16"),
|
|
LMSInfo("__pseudo_masked_store_32", "__masked_store_blend_32",
|
|
"__masked_store_32"),
|
|
LMSInfo("__pseudo_masked_store_64", "__masked_store_blend_64",
|
|
"__masked_store_64")
|
|
};
|
|
|
|
bool modifiedAny = false;
|
|
restart:
|
|
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
|
|
// Iterate through all of the instructions and look for
|
|
// __pseudo_masked_store_* calls.
|
|
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
|
|
if (callInst == NULL)
|
|
continue;
|
|
LMSInfo *info = NULL;
|
|
for (unsigned int i = 0; i < sizeof(msInfo) / sizeof(msInfo[0]); ++i) {
|
|
if (msInfo[i].pseudoFunc != NULL &&
|
|
callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
|
|
info = &msInfo[i];
|
|
break;
|
|
}
|
|
}
|
|
if (info == NULL)
|
|
continue;
|
|
|
|
llvm::Value *lvalue = callInst->getArgOperand(0);
|
|
llvm::Value *rvalue = callInst->getArgOperand(1);
|
|
llvm::Value *mask = callInst->getArgOperand(2);
|
|
|
|
// We need to choose between doing the load + blend + store trick,
|
|
// or serializing the masked store. Even on targets with a native
|
|
// masked store instruction, this is preferable since it lets us
|
|
// keep values in registers rather than going out to the stack.
|
|
bool doBlend = (!g->opt.disableBlendedMaskedStores ||
|
|
lIsStackVariablePointer(lvalue));
|
|
|
|
// Generate the call to the appropriate masked store function and
|
|
// replace the __pseudo_* one with it.
|
|
llvm::Function *fms = doBlend ? info->blendFunc : info->maskedStoreFunc;
|
|
llvm::Value *args[3] = { lvalue, rvalue, mask };
|
|
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
|
|
llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[3]);
|
|
llvm::Instruction *inst = llvm::CallInst::Create(fms, newArgArray, "",
|
|
callInst);
|
|
#else
|
|
llvm::Instruction *inst = llvm::CallInst::Create(fms, &args[0], &args[3], "",
|
|
callInst);
|
|
#endif
|
|
lCopyMetadata(inst, callInst);
|
|
|
|
callInst->eraseFromParent();
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
|
|
static llvm::Pass *
|
|
CreateLowerMaskedStorePass() {
|
|
return new LowerMaskedStorePass;
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// GSImprovementsPass
|
|
|
|
/** After earlier optimization passes have run, we are sometimes able to
|
|
determine that gathers/scatters are actually accessing memory in a more
|
|
regular fashion and then change the operation to something simpler and
|
|
more efficient. For example, if all of the lanes in a gather are
|
|
reading from the same location, we can instead do a scalar load and
|
|
broadcast. This pass examines gathers and scatters and tries to
|
|
simplify them if at all possible.
|
|
|
|
@todo Currently, this only looks for all program instances going to the
|
|
same location and all going to a linear sequence of locations in
|
|
memory. There are a number of other cases that might make sense to
|
|
look for, including things that could be handled with a vector load +
|
|
shuffle or things that could be handled with hybrids of e.g. 2 4-wide
|
|
vector loads with AVX, etc.
|
|
*/
|
|
class GSImprovementsPass : public llvm::BasicBlockPass {
|
|
public:
|
|
static char ID;
|
|
GSImprovementsPass() : BasicBlockPass(ID) { }
|
|
|
|
const char *getPassName() const { return "Gather/Scatter Improvements"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
};
|
|
|
|
|
|
char GSImprovementsPass::ID = 0;
|
|
|
|
llvm::RegisterPass<GSImprovementsPass> gsi("gs-improvements",
|
|
"Gather/Scatter Improvements Pass");
|
|
|
|
|
|
#if 0
|
|
// Debugging routine: dump the values of all of the elements in a
// flattened-out vector
static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC]) {
|
|
fprintf(stderr, "--- %s ---\n", info);
|
|
for (int i = 0; i < g->target.vectorWidth; ++i) {
|
|
fprintf(stderr, "%d: ", i);
|
|
elements[i]->dump();
|
|
}
|
|
fprintf(stderr, "-----\n");
|
|
}
|
|
#endif
|
|
|
|
|
|
/** Given an LLVM vector in vec, return a 'scalarized' version of the
|
|
vector in the provided scalarizedVector[] array. For example, if the
|
|
vector value passed in is:
|
|
|
|
add <4 x i32> %a_smear, <4 x i32> <4, 8, 12, 16>,
|
|
|
|
and if %a_smear was computed by replicating a scalar value i32 %a
|
|
across all of the elements of %a_smear, then the values returned will
|
|
be:
|
|
|
|
offsets[0] = add i32 %a, i32 4
|
|
offsets[1] = add i32 %a, i32 8
|
|
offsets[2] = add i32 %a, i32 12
|
|
offsets[3] = add i32 %a, i32 16
|
|
|
|
This function isn't fully general, but it seems to be able to handle
|
|
all of the patterns that currently arise in practice. If it can't
|
|
scalarize a vector value, then it just returns false and the calling
|
|
code proceeds as best it can without this information.
|
|
|
|
@param vec Vector to be scalarized
|
|
@param scalarizedVector Array in which to store the individual vector
|
|
elements
|
|
@param vectorLength Number of elements in the given vector. (The
|
|
passed scalarizedVector array must also be at least
|
|
this length as well.)
|
|
@param phiMap STL map from pointers to PHINodes that we've already
|
|
scalarized to the array of Value *s that they were
|
|
scalarized into.
|
|
@returns True if the vector was successfully scalarized and
|
|
the values in offsets[] are valid; false otherwise
|
|
*/
|
|
static bool
lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector,
                 int vectorLength, std::map<llvm::PHINode *, llvm::Value **> &phiMap) {
    // First initialize the values of scalarizedVector[] to NULL.
    for (int i = 0; i < vectorLength; ++i)
        scalarizedVector[i] = NULL;

    // It may be ok for the vector to be an undef vector; these come up for
    // example in shufflevector instructions.  As long as elements of the
    // undef vector aren't referenced by the shuffle indices, this is fine.
    if (llvm::isa<llvm::UndefValue>(vec))
        return true;

    // ConstantVectors are easy; just pull out the individual constant
    // element values
    llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(vec);
    if (cv != NULL) {
        for (int i = 0; i < vectorLength; ++i)
            scalarizedVector[i] = cv->getOperand(i);
        return true;
    }

    // It's also easy if it's just a vector of all zeros
    llvm::ConstantAggregateZero *caz =
        llvm::dyn_cast<llvm::ConstantAggregateZero>(vec);
    if (caz != NULL) {
        for (int i = 0; i < vectorLength; ++i)
            scalarizedVector[i] = LLVMInt32(0);
        return true;
    }

    llvm::BinaryOperator *bo = llvm::dyn_cast<llvm::BinaryOperator>(vec);
    if (bo) {
        // BinaryOperators are handled by attempting to scalarize both of
        // the operands.  If we're successful at this, then the vector of
        // scalar values we return from here are synthesized with scalar
        // versions of the original vector binary operator
        llvm::Instruction::BinaryOps opcode = bo->getOpcode();
        llvm::Value **v0 =
            (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
        llvm::Value **v1 =
            (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));

        if (!lScalarizeVector(bo->getOperand(0), v0, vectorLength, phiMap) ||
            !lScalarizeVector(bo->getOperand(1), v1, vectorLength, phiMap))
            return false;

        for (int i = 0; i < vectorLength; ++i) {
            scalarizedVector[i] =
                llvm::BinaryOperator::Create(opcode, v0[i], v1[i], "flat_bop", bo);
            lCopyMetadata(scalarizedVector[i], bo);
        }

        return true;
    }

    llvm::InsertElementInst *ie = llvm::dyn_cast<llvm::InsertElementInst>(vec);
    if (ie != NULL) {
        // If we have an InsertElement instruction, we generally have a
        // chain along the lines of:
        //
        // %v0 = insertelement undef, value_0, i32 index_0
        // %v1 = insertelement %v0, value_1, i32 index_1
        // ...
        // %vn = insertelement %vn-1, value_n-1, i32 index_n-1
        //
        // We start here with %vn and work backwards through the chain of
        // insertelement instructions until we get to the undef value that
        // started it all.  At each instruction, we set the appropriate
        // value in scalarizedVector[] based on the value being inserted.
        while (ie != NULL) {
            uint64_t iOffset = lGetIntValue(ie->getOperand(2));
            assert((int)iOffset < vectorLength);
            assert(scalarizedVector[iOffset] == NULL);

            scalarizedVector[iOffset] = ie->getOperand(1);

            llvm::Value *insertBase = ie->getOperand(0);
            ie = llvm::dyn_cast<llvm::InsertElementInst>(insertBase);
            if (!ie)
                assert(llvm::isa<llvm::UndefValue>(insertBase));
        }
        return true;
    }

    llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(vec);
    if (ci != NULL) {
        // Casts are similar to BinaryOperators in that we attempt to
        // scalarize the vector being cast and if successful, we apply
        // equivalent scalar cast operators to each of the values in the
        // scalarized vector.
        llvm::Instruction::CastOps op = ci->getOpcode();

        llvm::Value **scalarizedTarget =
            (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
        if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget,
                              vectorLength, phiMap))
            return false;

        LLVM_TYPE_CONST llvm::Type *destType = ci->getDestTy();
        LLVM_TYPE_CONST llvm::VectorType *vectorDestType =
            llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(destType);
        assert(vectorDestType != NULL);
        LLVM_TYPE_CONST llvm::Type *elementType = vectorDestType->getElementType();

        for (int i = 0; i < vectorLength; ++i) {
            scalarizedVector[i] =
                llvm::CastInst::Create(op, scalarizedTarget[i], elementType,
                                       "cast", ci);
            lCopyMetadata(scalarizedVector[i], ci);
        }
        return true;
    }

    llvm::ShuffleVectorInst *svi = llvm::dyn_cast<llvm::ShuffleVectorInst>(vec);
    if (svi != NULL) {
        LLVM_TYPE_CONST llvm::VectorType *svInstType =
            llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(svi->getType());
        assert(svInstType != NULL);
        assert((int)svInstType->getNumElements() == vectorLength);

        // Scalarize the two vectors being shuffled.  First figure out how
        // big they are.
        LLVM_TYPE_CONST llvm::Type *type0 = svi->getOperand(0)->getType();
        LLVM_TYPE_CONST llvm::Type *type1 = svi->getOperand(1)->getType();
        LLVM_TYPE_CONST llvm::VectorType *vectorType0 =
            llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(type0);
        LLVM_TYPE_CONST llvm::VectorType *vectorType1 =
            llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(type1);
        assert(vectorType0 != NULL && vectorType1 != NULL);

        int n0 = vectorType0->getNumElements();
        int n1 = vectorType1->getNumElements();

        // Go ahead and scalarize the two input vectors now.
        llvm::Value **v0 = (llvm::Value **)alloca(n0 * sizeof(llvm::Value *));
        llvm::Value **v1 = (llvm::Value **)alloca(n1 * sizeof(llvm::Value *));

        if (!lScalarizeVector(svi->getOperand(0), v0, n0, phiMap) ||
            !lScalarizeVector(svi->getOperand(1), v1, n1, phiMap))
            return false;

        llvm::ConstantAggregateZero *caz =
            llvm::dyn_cast<llvm::ConstantAggregateZero>(svi->getOperand(2));
        if (caz != NULL) {
            for (int i = 0; i < vectorLength; ++i)
                scalarizedVector[i] = v0[0];
        }
        else {
            llvm::ConstantVector *shuffleIndicesVector =
                llvm::dyn_cast<llvm::ConstantVector>(svi->getOperand(2));
            // I think this has to be a ConstantVector.  If this ever hits,
            // we'll dig into what we got instead and figure out how to handle
            // that...
            assert(shuffleIndicesVector != NULL);

            // Get the integer indices for each element of the returned vector
            llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> shuffleIndices;
            shuffleIndicesVector->getVectorElements(shuffleIndices);
            assert((int)shuffleIndices.size() == vectorLength);

            // And loop over the indices, setting the i'th element of the
            // result vector with the source vector element that corresponds to
            // the i'th shuffle index value.
            for (unsigned int i = 0; i < shuffleIndices.size(); ++i) {
                // I'm not sure when this case would ever happen, though..
                assert(llvm::isa<llvm::ConstantInt>(shuffleIndices[i]));

                int offset = (int)lGetIntValue(shuffleIndices[i]);
                assert(offset >= 0 && offset < n0+n1);

                if (offset < n0)
                    // Offsets from 0 to n0-1 index into the first vector
                    scalarizedVector[i] = v0[offset];
                else
                    // And offsets from n0 to (n0+n1-1) index into the second
                    // vector
                    scalarizedVector[i] = v1[offset - n0];
            }
        }
        return true;
    }

    llvm::LoadInst *li = llvm::dyn_cast<llvm::LoadInst>(vec);
    if (li != NULL) {
        llvm::Value *baseAddr = li->getOperand(0);
        LLVM_TYPE_CONST llvm::Type *intPtrType = g->target.is32bit ?
            LLVMTypes::Int32Type : LLVMTypes::Int64Type;
        llvm::Value *baseInt = new llvm::PtrToIntInst(baseAddr, intPtrType,
                                                      "base2int", li);
        lCopyMetadata(baseInt, li);

        LLVM_TYPE_CONST llvm::PointerType *ptrType =
            llvm::dyn_cast<llvm::PointerType>(baseAddr->getType());
        assert(ptrType != NULL);
        LLVM_TYPE_CONST llvm::VectorType *vecType =
            llvm::dyn_cast<llvm::VectorType>(ptrType->getElementType());
        assert(vecType != NULL);
        LLVM_TYPE_CONST llvm::Type *elementType = vecType->getElementType();

        llvm::Value *elementSize = g->target.SizeOf(elementType);

        LLVM_TYPE_CONST llvm::Type *eltPtrType = llvm::PointerType::get(elementType, 0);

        for (int i = 0; i < vectorLength; ++i) {
            llvm::Value *offset =
                llvm::BinaryOperator::Create(llvm::Instruction::Mul, elementSize,
                                             g->target.is32bit ? LLVMInt32(i) :
                                                                 LLVMInt64(i),
                                             "elt_offset", li);
            llvm::Value *intPtrOffset =
                llvm::BinaryOperator::Create(llvm::Instruction::Add, baseInt,
                                             offset, "baseoffset", li);
            lCopyMetadata(intPtrOffset, li);
            llvm::Value *scalarLoadPtr =
                new llvm::IntToPtrInst(intPtrOffset, eltPtrType, "int2ptr", li);
            lCopyMetadata(scalarLoadPtr, li);

            llvm::Instruction *scalarLoad =
                new llvm::LoadInst(scalarLoadPtr, "loadelt", li);
            lCopyMetadata(scalarLoad, li);
            scalarizedVector[i] = scalarLoad;
        }
        return true;
    }

    llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(vec);
    if (phi != NULL) {
        // If we've seen this phi node during an earlier recursive step,
        // then don't re-scalarize it, but return the already allocated
        // pointer values.  (This both avoids an infinite loop and ensures
        // that we point back to ourself properly for cases where some of
        // the incoming values end up being derived from the phi node...)
        if (phiMap.find(phi) != phiMap.end()) {
            llvm::Value **v = phiMap[phi];
            for (int i = 0; i < vectorLength; ++i)
                scalarizedVector[i] = v[i];
            return true;
        }

        LLVM_TYPE_CONST llvm::VectorType *vecType =
            llvm::dyn_cast<llvm::VectorType>(phi->getType());
        assert(vecType != NULL);
        LLVM_TYPE_CONST llvm::Type *eltType = vecType->getElementType();
        unsigned int numIncoming = phi->getNumIncomingValues();

        // First allocate all of the scalarized phi nodes, so that we can
        // get them into the map<> before making recursive calls to
        // lScalarizeVector.
        for (int i = 0; i < vectorLength; ++i) {
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
            scalarizedVector[i] =
                llvm::PHINode::Create(eltType, numIncoming, "phi", phi);
#else
            scalarizedVector[i] =
                llvm::PHINode::Create(eltType, "phi", phi);
#endif // LLVM_3_0
            lCopyMetadata(scalarizedVector[i], phi);
        }

        phiMap[phi] = scalarizedVector;

        llvm::Value **vin = (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
        // Now, for each incoming value, scalarize it recursively and then
        // update the scalarized phi node for this element.
        for (unsigned int i = 0; i < numIncoming; ++i) {
            if (!lScalarizeVector(phi->getIncomingValue(i), vin, vectorLength, phiMap))
                return false;
            llvm::BasicBlock *bbin = phi->getIncomingBlock(i);

            for (int j = 0; j < vectorLength; ++j) {
                llvm::PHINode *phiElt = (llvm::PHINode *)scalarizedVector[j];
                phiElt->addIncoming(vin[j], bbin);
            }
        }

        phiMap.erase(phiMap.find(phi));
        return true;
    }

#if 0
    fprintf(stderr, "flatten vector fixme\n");
    vec->dump();
    assert(0);
#endif

    return false;
}

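// Worked example (hypothetical values, for illustration only): given the
// insertelement chain
//
//     %v0 = insertelement <2 x i32> undef, i32 %x, i32 0
//     %v1 = insertelement <2 x i32> %v0,   i32 %y, i32 1
//
// calling lScalarizeVector(%v1, scalarizedVector, 2, phiMap) walks the chain
// backwards and fills in scalarizedVector[0] = %x and scalarizedVector[1] = %y
// without creating any new instructions.
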
/** Conservative test to see if two values are equal.  There are
    (potentially many) cases where the two values actually are equal but
    this will return false.  However, if it does return true, the two
    values definitely are equal.

    @todo This seems to catch all of the cases we currently need it for in
    practice, but it'd be nice to make it a little more robust/general.  In
    general, though, a little something called the halting problem means we
    won't get all of them.
*/
static bool
lValuesAreEqual(llvm::Value *v0, llvm::Value *v1,
                std::vector<llvm::PHINode *> &seenPhi0,
                std::vector<llvm::PHINode *> &seenPhi1) {
    // Thanks to the fact that LLVM hashes and returns the same pointer for
    // constants (of all sorts, even constant expressions), this first test
    // actually catches a lot of cases.  LLVM's SSA form also helps a lot
    // with this..
    if (v0 == v1)
        return true;

    assert(seenPhi0.size() == seenPhi1.size());
    for (unsigned int i = 0; i < seenPhi0.size(); ++i)
        if (v0 == seenPhi0[i] && v1 == seenPhi1[i])
            return true;

    llvm::BinaryOperator *bo0 = llvm::dyn_cast<llvm::BinaryOperator>(v0);
    llvm::BinaryOperator *bo1 = llvm::dyn_cast<llvm::BinaryOperator>(v1);
    if (bo0 != NULL && bo1 != NULL) {
        if (bo0->getOpcode() != bo1->getOpcode())
            return false;
        return (lValuesAreEqual(bo0->getOperand(0), bo1->getOperand(0),
                                seenPhi0, seenPhi1) &&
                lValuesAreEqual(bo0->getOperand(1), bo1->getOperand(1),
                                seenPhi0, seenPhi1));
    }

    llvm::PHINode *phi0 = llvm::dyn_cast<llvm::PHINode>(v0);
    llvm::PHINode *phi1 = llvm::dyn_cast<llvm::PHINode>(v1);
    if (phi0 != NULL && phi1 != NULL) {
        if (phi0->getNumIncomingValues() !=
            phi1->getNumIncomingValues())
            return false;

        seenPhi0.push_back(phi0);
        seenPhi1.push_back(phi1);

        unsigned int numIncoming = phi0->getNumIncomingValues();
        // Check all of the incoming values: if all of them are equal,
        // then we're good.
        bool anyFailure = false;
        for (unsigned int i = 0; i < numIncoming; ++i) {
            assert(phi0->getIncomingBlock(i) == phi1->getIncomingBlock(i));
            if (!lValuesAreEqual(phi0->getIncomingValue(i),
                                 phi1->getIncomingValue(i), seenPhi0, seenPhi1)) {
                anyFailure = true;
                break;
            }
        }

        seenPhi0.pop_back();
        seenPhi1.pop_back();

        return !anyFailure;
    }

    return false;
}

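// For example (illustrative): two distinct instructions "%a = add i32 %base, 4"
// and "%b = add i32 %base, 4" compare equal here, since the recursive check
// sees matching opcodes and pointer-identical operands even though %a and %b
// are different llvm::Value pointers.  The seenPhi0/seenPhi1 vectors let
// mutually-corresponding phi nodes be treated as equal so that loop-carried
// values don't cause infinite recursion.
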
/** Tests to see if all of the llvm::Values in the array are equal.  Like
    lValuesAreEqual, this is a conservative test and may return false for
    arrays where the values are actually all equal.
*/
static bool
lVectorValuesAllEqual(llvm::Value **v, int vectorLength) {
    std::vector<llvm::PHINode *> seenPhi0, seenPhi1;
    for (int i = 0; i < vectorLength-1; ++i)
        if (!lValuesAreEqual(v[i], v[i+1], seenPhi0, seenPhi1))
            return false;
    return true;
}

/** Given an array of scalar integer values, test to see if they are a
    linear sequence of compile-time constant integers starting from an
    arbitrary value but then having a step of value "stride" between
    elements.
 */
static bool
lVectorIsLinearConstantInts(llvm::Value **v, int vectorLength, int stride) {
    llvm::ConstantInt *prev = llvm::dyn_cast<llvm::ConstantInt>(v[0]);
    if (!prev)
        return false;
    int prevVal = (int)prev->getZExtValue();

    // For each element in the array, see if it is both a ConstantInt and
    // if the difference between it and the value of the previous element
    // is stride.  If not, fail.
    for (int i = 1; i < vectorLength; ++i) {
        llvm::ConstantInt *next = llvm::dyn_cast<llvm::ConstantInt>(v[i]);
        if (!next)
            return false;

        int nextVal = (int)next->getZExtValue();
        if (prevVal + stride != nextVal)
            return false;

        prevVal = nextVal;
    }
    return true;
}

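// For example (illustrative): the element arrays {4, 8, 12, 16} and
// {100, 104, 108, 112} both satisfy lVectorIsLinearConstantInts() with
// stride == 4, while {0, 4, 12, 16} and {%x, 4, 8, 12} do not (the latter
// because element 0 isn't a ConstantInt).
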
/** Given an array of integer-typed values, see if the elements of the
    array have a step of 'stride' between their values.  This function
    tries to handle as many possibilities as possible, including things
    like all elements equal to some non-constant value plus an integer
    offset, etc.

    @todo FIXME Crazy thought: can we just build up expressions that
    subtract the constants [v[0], v[0]+stride, v[0]+2*stride, ...] from the
    given values, throw the LLVM optimizer at those, and then see if we get
    back an array of all zeros?
*/
static bool
lVectorIsLinear(llvm::Value **v, int vectorLength, int stride,
                std::set<llvm::PHINode *> &seenPhis) {
#if 0
    lPrintVector("called lVectorIsLinear", v);
#endif

    // First try the easy case: if the values are all just constant
    // integers and have the expected stride between them, then we're done.
    if (lVectorIsLinearConstantInts(v, vectorLength, stride))
        return true;

    // ConstantExprs need a bit of deconstruction to figure out

    // FIXME: do we need to handle cases where e.g. v[0] is an
    // llvm::ConstantInt and then the rest are ConstExprs??
    if (llvm::dyn_cast<llvm::ConstantExpr>(v[0])) {
        // First, see if all of the array elements are ConstantExprs of
        // some sort.  If not, give up.
        // FIXME: are we potentially missing cases here, e.g. a mixture of
        // ConstantExprs and ConstantInts?
        for (int i = 0; i < vectorLength; ++i) {
            if (!llvm::isa<llvm::ConstantExpr>(v[i]))
                return false;
        }

        // See if any of the array elements are adds of constant
        // expressions.  As it turns out, LLVM's constant expression
        // optimizer is very thorough about converting "add(0, foo)" to
        // "foo", so we need to deal with cases where element 0 is "foo",
        // element 1 is add(4, foo), etc...
        bool anyAdds = false, allAdds = true;
        for (int i = 0; i < vectorLength; ++i) {
            llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(v[i]);
            if (ce->getOpcode() == llvm::Instruction::Add ||
                ce->getOpcode() == llvm::Instruction::Sub)
                anyAdds = true;
            else
                allAdds = false;
        }

        if (anyAdds && !allAdds) {
            // In v[], we should have an array of elements that are all
            // either ConstExprs with add operators, where one of the
            // operands is a constant int, or other non-add ConstExpr
            // values.
            //
            // Now we go through each element and:
            // 1. For ones that aren't add ConstExprs, treat them as if they
            //    are an add with 0 as the other operand.
            // 2. Extract the ConstantInt operand of the add into the intBit[]
            //    array and put the other operand in the otherBit[] array.
            llvm::Value **intBit = (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
            llvm::Value **otherBit = (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
            for (int i = 0; i < vectorLength; ++i) {
                llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(v[i]);
                if (ce->getOpcode() == llvm::Instruction::Add) {
                    // The ConstantInt may be either of the two operands of
                    // the add.  Put the operands in the right arrays.
                    if (llvm::isa<llvm::ConstantInt>(ce->getOperand(0))) {
                        intBit[i] = ce->getOperand(0);
                        otherBit[i] = ce->getOperand(1);
                    }
                    else {
                        intBit[i] = ce->getOperand(1);
                        otherBit[i] = ce->getOperand(0);
                    }
                }
                else if (ce->getOpcode() == llvm::Instruction::Sub) {
                    // Treat subtraction as an add with a negative value..
                    if (llvm::isa<llvm::ConstantInt>(ce->getOperand(0))) {
                        intBit[i] = ce->getOperand(0);
                        otherBit[i] = llvm::ConstantExpr::getNeg(ce->getOperand(1));
                    }
                    else {
                        intBit[i] = ce->getOperand(1);
                        otherBit[i] = llvm::ConstantExpr::getNeg(ce->getOperand(0));
                    }
                }
                else {
                    // We don't have an Add or a Sub, so pretend we have an
                    // add with zero.
                    intBit[i] = LLVMInt32(0);
                    otherBit[i] = v[i];
                }
            }

            // Now that everything is lined up, see if we have a case where
            // we're adding constant values with the desired stride to the
            // same base value.  If so, we know we have a linear set of
            // locations.
            return (lVectorIsLinear(intBit, vectorLength, stride, seenPhis) &&
                    lVectorValuesAllEqual(otherBit, vectorLength));
        }

        // If this ever hits, the assertion can just be commented out and
        // false returned below.  However, it's worth figuring out how the
        // analysis needs to be generalized rather than necessarily giving
        // up and possibly hurting performance of the final code.
        FATAL("Unexpected case with a ConstantExpr in lVectorIsLinear");
#if 0
        for (int i = 0; i < vectorLength; ++i)
            v[i]->dump();
        FATAL("FIXME");
#endif
        return false;
    }

    llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v[0]);
    if (bop) {
        // We also need to deal with non-constant binary operators that
        // represent linear accesses here..
        // FIXME: here, too, what about cases with v[0] being a load or something
        // and then everything after element 0 being a binary operator with an add.
        // That won't get caught by this case??
        bool anyAdd = false, anySub = false;
        for (int i = 0; i < vectorLength; ++i) {
            llvm::BinaryOperator *bopi = llvm::dyn_cast<llvm::BinaryOperator>(v[i]);
            if (bopi) {
                if (bopi->getOpcode() == llvm::Instruction::Add)
                    anyAdd = true;
                else if (bopi->getOpcode() == llvm::Instruction::Sub)
                    anySub = true;
            }
        }

        if (anyAdd && anySub)
            return false;
        if (anyAdd || anySub) {
            // is one of the operands the same for all elements? if so, then just
            // need to check this case for the other operand...

            // FIXME: do we need a more general check that starts with both
            // the first and second operand of v[0]'s add and then checks
            // the remainder of the elements to see if either one of their
            // two operands matches the one we started with?  That would be
            // more robust to switching the ordering of operands, in case
            // that ever happens...
            llvm::Value **addSubOperandValues =
                (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));

            for (int operand = 0; operand <= 1; ++operand) {
                // Go through the vector elements and grab the operand'th
                // one if this is an add or sub; otherwise take the value itself.
                for (int i = 0; i < vectorLength; ++i) {
                    llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v[i]);
                    if (bop->getOpcode() == llvm::Instruction::Add ||
                        bop->getOpcode() == llvm::Instruction::Sub)
                        addSubOperandValues[i] = bop->getOperand(operand);
                    else
                        // The other elements are adds or subtracts, so we'll
                        // treat this one as an "add 0" in the below; just
                        // grab the value v[i] itself
                        addSubOperandValues[i] = v[i];
                }

                if (lVectorValuesAllEqual(addSubOperandValues, vectorLength) &&
                    (anyAdd || operand == 1)) {
                    // If this operand's values are all equal, then the
                    // overall result is an ascending linear sequence if
                    // the other operand's values are themselves a linear
                    // sequence and if either this is an add or we're
                    // looking at the 2nd operand.  i.e.:
                    //
                    // unif + programIndex -> ascending linear sequence
                    // programIndex + unif -> ascending linear sequence
                    // programIndex - unif -> ascending linear sequence
                    // unif - programIndex -> *descending* linear sequence
                    //
                    // We don't match the descending case for now; at some
                    // future point we could generate code for that as a
                    // vector load + shuffle.
                    int otherOperand = operand ^ 1;
                    for (int i = 0; i < vectorLength; ++i) {
                        llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v[i]);
                        if (bop->getOpcode() == llvm::Instruction::Add ||
                            bop->getOpcode() == llvm::Instruction::Sub)
                            addSubOperandValues[i] = bop->getOperand(otherOperand);
                        else
                            addSubOperandValues[i] = LLVMInt32(0);
                    }
                    return lVectorIsLinear(addSubOperandValues, vectorLength, stride, seenPhis);
                }
            }
        }
        else if (bop->getOpcode() == llvm::Instruction::Mul) {
            // Finally, if we have a multiply, then if one of the operands
            // has the same value for all elements and if the other operand
            // is a linear sequence such that the scale times the sequence
            // values is a linear sequence with the desired stride, then
            // we're good.
            llvm::ConstantInt *op0 = llvm::dyn_cast<llvm::ConstantInt>(bop->getOperand(0));
            llvm::ConstantInt *op1 = llvm::dyn_cast<llvm::ConstantInt>(bop->getOperand(1));

            // We need one of them to be a constant for us to be able to proceed...
            if (!op0 && !op1)
                return false;
            // But if they're both constants, then the LLVM constant folder
            // should have simplified them down to their product!
            assert(!(op0 && op1));

            // Figure out which operand number is the constant scale and
            // which is the varying one
            int scaleOperand, otherOperand;
            llvm::ConstantInt *scaleValue;
            if (op0 != NULL) {
                scaleOperand = 0;
                otherOperand = 1;
                scaleValue = op0;
            }
            else {
                scaleOperand = 1;
                otherOperand = 0;
                scaleValue = op1;
            }

            // Find the scale value; make sure it evenly divides the
            // stride.  Otherwise there's no chance that the scale times a
            // set of integer values will give a sequence with the desired
            // stride.
            int mulScale = (int)scaleValue->getZExtValue();
            if (mulScale == 0 || (stride % mulScale) != 0)
                return false;

            llvm::Value **otherValue = (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
            for (int i = 0; i < vectorLength; ++i) {
                llvm::BinaryOperator *eltBop = llvm::dyn_cast<llvm::BinaryOperator>(v[i]);
                // Give up if it's not matching the desired pattern of "all
                // mul ops with the scaleOperand being a constant with the
                // same value".
                if (!eltBop || eltBop->getOpcode() != llvm::Instruction::Mul)
                    return false;
                if (eltBop->getOperand(scaleOperand) != scaleValue)
                    return false;

                otherValue[i] = eltBop->getOperand(otherOperand);
            }
            // Now see if the sequence of values being scaled gives us
            // something with the desired stride.
            return lVectorIsLinear(otherValue, vectorLength, stride / mulScale, seenPhis);
        }
    }

    if (llvm::dyn_cast<llvm::PHINode>(v[0]) != NULL) {
        int found = 0;
        // If all of them have made it back to a phi node we've seen
        // before, then we're good.
        for (int i = 0; i < vectorLength; ++i) {
            llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v[i]);
            assert(phi != NULL);
            if (seenPhis.find(phi) != seenPhis.end())
                ++found;
        }
        assert(found == 0 || found == vectorLength);
        if (found == vectorLength)
            return true;

        // Otherwise record that we've seen these guys before.
        for (int i = 0; i < vectorLength; ++i) {
            llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v[i]);
            seenPhis.insert(phi);
        }

        llvm::Value **vin = (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *));
        llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v[0]);
        unsigned int numIncoming = phi->getNumIncomingValues();
        llvm::BasicBlock *bb = NULL;
        bool anyFailure = false;
        // Check each incoming value; if all of them are linear, then success.
        for (unsigned int i = 0; i < numIncoming; ++i) {
            for (int j = 0; j < vectorLength; ++j) {
                llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v[j]);
                assert(phi != NULL);
                vin[j] = phi->getIncomingValue(i);

                if (j == 0)
                    bb = phi->getIncomingBlock(i);
                else
                    assert(bb == phi->getIncomingBlock(i));
            }

            if (!lVectorIsLinear(vin, vectorLength, stride, seenPhis)) {
                // Don't return false immediately, since we need to remove
                // the PHINode *s from v[] from seenPhis before we return.
                anyFailure = true;
                break;
            }
        }

        for (int i = 0; i < vectorLength; ++i) {
            llvm::PHINode *phi = llvm::dyn_cast<llvm::PHINode>(v[i]);
            seenPhis.erase(phi);
        }

        return !anyFailure;
    }

    return false;
}

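// Worked example (hypothetical, for illustration only): for a 32-bit gather
// the desired byte stride is 4, and per-lane offsets computed as
//
//     offset[i] = 4 * (%index_base + i)
//
// scalarize to "mul 4, (add %index_base, i)".  The Mul case above checks that
// the scale 4 evenly divides the stride 4 and then recurses on the
// (add %index_base, i) values with stride 4/4 = 1; the Add case then sees one
// operand equal across all lanes (%index_base) and the other a constant
// sequence {0, 1, 2, 3} with stride 1, so the whole vector is reported as
// linear.
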
struct GatherImpInfo {
    GatherImpInfo(const char *pName, const char *lbName, const char *lmName,
                  int a)
        : align(a) {
        pseudoFunc = m->module->getFunction(pName);
        loadBroadcastFunc = m->module->getFunction(lbName);
        loadMaskedFunc = m->module->getFunction(lmName);

        assert(pseudoFunc != NULL && loadBroadcastFunc != NULL &&
               loadMaskedFunc != NULL);
    }
    llvm::Function *pseudoFunc;
    llvm::Function *loadBroadcastFunc;
    llvm::Function *loadMaskedFunc;
    const int align;
};


struct ScatterImpInfo {
    ScatterImpInfo(const char *pName, const char *msName,
                   LLVM_TYPE_CONST llvm::Type *vpt, int a)
        : align(a) {
        pseudoFunc = m->module->getFunction(pName);
        maskedStoreFunc = m->module->getFunction(msName);
        vecPtrType = vpt;
        assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
    }
    llvm::Function *pseudoFunc;
    llvm::Function *maskedStoreFunc;
    LLVM_TYPE_CONST llvm::Type *vecPtrType;
    const int align;
};

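// The GatherImpInfo/ScatterImpInfo tables built at the top of
// runOnBasicBlock() below map each __pseudo_gather_base_offsets_* /
// __pseudo_scatter_base_offsets_* function to the load-and-broadcast,
// masked-load, or masked-store function to substitute for it, along with the
// element size in bytes (which also serves as the expected stride for the
// linear-access check).
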
bool
GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
    GatherImpInfo gInfo[] = {
        GatherImpInfo("__pseudo_gather_base_offsets_8", "__load_and_broadcast_8",
                      "__load_masked_8", 1),
        GatherImpInfo("__pseudo_gather_base_offsets_16", "__load_and_broadcast_16",
                      "__load_masked_16", 2),
        GatherImpInfo("__pseudo_gather_base_offsets_32", "__load_and_broadcast_32",
                      "__load_masked_32", 4),
        GatherImpInfo("__pseudo_gather_base_offsets_64", "__load_and_broadcast_64",
                      "__load_masked_64", 8)
    };
    ScatterImpInfo sInfo[] = {
        ScatterImpInfo("__pseudo_scatter_base_offsets_8", "__pseudo_masked_store_8",
                       LLVMTypes::Int8VectorPointerType, 1),
        ScatterImpInfo("__pseudo_scatter_base_offsets_16", "__pseudo_masked_store_16",
                       LLVMTypes::Int16VectorPointerType, 2),
        ScatterImpInfo("__pseudo_scatter_base_offsets_32", "__pseudo_masked_store_32",
                       LLVMTypes::Int32VectorPointerType, 4),
        ScatterImpInfo("__pseudo_scatter_base_offsets_64", "__pseudo_masked_store_64",
                       LLVMTypes::Int64VectorPointerType, 8)
    };

    bool modifiedAny = false;

 restart:
    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
        // Iterate over all of the instructions and look for calls to
        // __pseudo_*_base_offsets_* calls.
        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
        if (callInst == NULL)
            continue;
        llvm::Function *calledFunc = callInst->getCalledFunction();
        GatherImpInfo *gatherInfo = NULL;
        ScatterImpInfo *scatterInfo = NULL;
        for (unsigned int i = 0; i < sizeof(gInfo) / sizeof(gInfo[0]); ++i) {
            if (gInfo[i].pseudoFunc != NULL &&
                calledFunc == gInfo[i].pseudoFunc) {
                gatherInfo = &gInfo[i];
                break;
            }
        }
        for (unsigned int i = 0; i < sizeof(sInfo) / sizeof(sInfo[0]); ++i) {
            if (sInfo[i].pseudoFunc != NULL &&
                calledFunc == sInfo[i].pseudoFunc) {
                scatterInfo = &sInfo[i];
                break;
            }
        }
        if (gatherInfo == NULL && scatterInfo == NULL)
            continue;

        SourcePos pos;
        bool ok = lGetSourcePosFromMetadata(callInst, &pos);
        assert(ok);

        // Get the actual base pointer; note that it comes into the gather
        // or scatter function bitcast to an i8 *, so we need to work back
        // to get the pointer as the original type.
        llvm::Value *base = callInst->getArgOperand(0);
        llvm::BitCastInst *bci = llvm::dyn_cast<llvm::BitCastInst>(base);
        if (bci)
            base = bci->getOperand(0);
        llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(base);
        if (ce && ce->getOpcode() == llvm::Instruction::BitCast)
            base = ce->getOperand(0);

        // Try to find out the offsets; the i'th element of the
        // offsetElements array should be an i32 with the value of the
        // offset for the i'th vector lane.  This may fail; if so, just
        // give up.
        llvm::Value *vecValue = callInst->getArgOperand(1);
        LLVM_TYPE_CONST llvm::VectorType *vt =
            llvm::dyn_cast<llvm::VectorType>(vecValue->getType());
        assert(vt != NULL);
        int vecLength = vt->getNumElements();
        assert(vecLength == g->target.vectorWidth);
        llvm::Value *offsetElements[ISPC_MAX_NVEC];
        std::map<llvm::PHINode *, llvm::Value **> phiMap;
        if (!lScalarizeVector(vecValue, offsetElements, vecLength, phiMap))
            continue;

        llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3);

        if (lVectorValuesAllEqual(offsetElements, g->target.vectorWidth)) {
            // If all the offsets are equal, then compute the single
            // pointer they all represent based on the first one of them
            // (arbitrarily).
            llvm::Value *indices[1] = { offsetElements[0] };
            llvm::Value *basei8 =
                new llvm::BitCastInst(base, LLVMTypes::VoidPointerType,
                                      "base2i8", callInst);
            lCopyMetadata(basei8, callInst);
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
            llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[1]);
            llvm::Value *ptr =
                llvm::GetElementPtrInst::Create(basei8, arrayRef, "ptr", callInst);
#else
            llvm::Value *ptr =
                llvm::GetElementPtrInst::Create(basei8, &indices[0], &indices[1],
                                                "ptr", callInst);
#endif
            lCopyMetadata(ptr, callInst);

            if (gatherInfo != NULL) {
                // A gather with everyone going to the same location is
                // handled as a scalar load and broadcast across the lanes.
                // Note that we do still have to pass the mask to the
                // __load_and_broadcast_* functions, since they shouldn't
                // access memory if the mask is all off (the location may
                // be invalid in that case).
                Debug(pos, "Transformed gather to scalar load and broadcast!");
                llvm::Value *args[2] = { ptr, mask };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
                llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[2]);
                llvm::Instruction *newCall =
                    llvm::CallInst::Create(gatherInfo->loadBroadcastFunc, newArgArray,
                                           "load_broadcast", (llvm::Instruction *)NULL);
#else
                llvm::Instruction *newCall =
                    llvm::CallInst::Create(gatherInfo->loadBroadcastFunc, &args[0],
                                           &args[2], "load_broadcast");
#endif
                lCopyMetadata(newCall, callInst);
                llvm::ReplaceInstWithInst(callInst, newCall);
            }
            else {
                // A scatter with everyone going to the same location is
                // undefined.  Issue a warning and arbitrarily let the
                // first guy win.
                Warning(pos, "Undefined behavior: all program instances are "
                        "writing to the same location!");

                llvm::Value *rvalue = callInst->getArgOperand(2);
                llvm::Value *first =
                    llvm::ExtractElementInst::Create(rvalue, LLVMInt32(0), "rvalue_first",
                                                     callInst);
                lCopyMetadata(first, callInst);
                ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(first->getType(), 0),
                                            "ptr2rvalue_type", callInst);
                lCopyMetadata(ptr, callInst);
                llvm::Instruction *sinst = new llvm::StoreInst(first, ptr, false,
                                                               scatterInfo->align);
                lCopyMetadata(sinst, callInst);
                llvm::ReplaceInstWithInst(callInst, sinst);
            }

            modifiedAny = true;
            goto restart;
        }

        int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
        std::set<llvm::PHINode *> seenPhis;
        if (lVectorIsLinear(offsetElements, g->target.vectorWidth, step, seenPhis)) {
            // We have a linear sequence of memory locations being accessed,
            // starting with the location given by the offset from
            // offsetElements[0], with a stride in bytes equal to the size of
            // the accessed element type (1, 2, 4, or 8 bytes for the 8, 16,
            // 32, and 64-bit gather/scatters, respectively).

            // Get the base pointer using the first guy's offset.
            llvm::Value *indices[1] = { offsetElements[0] };
            llvm::Value *basei8 =
                new llvm::BitCastInst(base, LLVMTypes::VoidPointerType, "base2i8", callInst);
            lCopyMetadata(basei8, callInst);
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
            llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[1]);
            llvm::Value *ptr =
                llvm::GetElementPtrInst::Create(basei8, arrayRef, "ptr", callInst);
#else
            llvm::Value *ptr =
                llvm::GetElementPtrInst::Create(basei8, &indices[0], &indices[1],
                                                "ptr", callInst);
#endif
            lCopyMetadata(ptr, callInst);

            if (gatherInfo != NULL) {
                Debug(pos, "Transformed gather to unaligned vector load!");
                llvm::Value *args[2] = { ptr, mask };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
                llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[2]);
                llvm::Instruction *newCall =
                    llvm::CallInst::Create(gatherInfo->loadMaskedFunc, argArray,
                                           "load_masked", (llvm::Instruction *)NULL);
#else
                llvm::Instruction *newCall =
                    llvm::CallInst::Create(gatherInfo->loadMaskedFunc, &args[0],
                                           &args[2], "load_masked");
#endif
                lCopyMetadata(newCall, callInst);
                llvm::ReplaceInstWithInst(callInst, newCall);
            }
            else {
                Debug(pos, "Transformed scatter to unaligned vector store!");
                llvm::Value *rvalue = callInst->getArgOperand(2);
                ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast",
                                            callInst);

                llvm::Value *args[3] = { ptr, rvalue, mask };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
                llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[3]);
                llvm::Instruction *newCall =
                    llvm::CallInst::Create(scatterInfo->maskedStoreFunc, argArray,
                                           "", (llvm::Instruction *)NULL);
#else
                llvm::Instruction *newCall =
                    llvm::CallInst::Create(scatterInfo->maskedStoreFunc,
                                           &args[0], &args[3], "");
#endif
                lCopyMetadata(newCall, callInst);
                llvm::ReplaceInstWithInst(callInst, newCall);
            }

            modifiedAny = true;
            goto restart;
        }

#if 0
        lPrintVector("scatter/gather no love: flattened", offsetElements);
        bb.dump();
#endif
    }

    return modifiedAny;
}

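// Roughly speaking (illustrative ispc-level example, assuming an index
// expression based on the varying programIndex value): for code like
//
//     uniform float a[...];
//     float x = a[programIndex + base];   // offsets linear in programIndex
//     float y = a[base];                  // same location for all lanes
//
// the first access is a candidate for the unaligned masked vector load
// rewrite and the second for the scalar load + broadcast rewrite, provided
// lScalarizeVector() and the equality/linearity analyses above succeed on the
// offset vectors that the front end generated.
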
static llvm::Pass *
CreateGatherScatterImprovementsPass() {
    return new GSImprovementsPass;
}


///////////////////////////////////////////////////////////////////////////
// LowerGSPass

/** For any gathers and scatters remaining after the GSImprovementsPass
    runs, we need to turn them into actual native gathers and scatters.
    This task is handled by the LowerGSPass here.
 */
class LowerGSPass : public llvm::BasicBlockPass {
public:
    static char ID;
    LowerGSPass() : BasicBlockPass(ID) { }

    const char *getPassName() const { return "Lower Gather/Scatter"; }
    bool runOnBasicBlock(llvm::BasicBlock &BB);
};


char LowerGSPass::ID = 0;

llvm::RegisterPass<LowerGSPass> lgs("lower-gs",
                                    "Lower Gather/Scatter Pass");

struct LowerGSInfo {
    LowerGSInfo(const char *pName, const char *aName, bool ig)
        : isGather(ig) {
        pseudoFunc = m->module->getFunction(pName);
        actualFunc = m->module->getFunction(aName);
        assert(pseudoFunc != NULL && actualFunc != NULL);
    }
    llvm::Function *pseudoFunc;
    llvm::Function *actualFunc;
    const bool isGather;
};

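// Unlike GSImprovementsPass, this pass doesn't analyze the offsets at all; it
// simply retargets each remaining pseudo call to the corresponding
// target-specific implementation.  Conceptually (illustrative only), a call
// to __pseudo_gather_base_offsets_32 keeps its arguments unchanged and just
// becomes a call to __gather_base_offsets_i32, and a performance warning is
// issued since a real gather or scatter is now unavoidable.
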
bool
LowerGSPass::runOnBasicBlock(llvm::BasicBlock &bb) {
    LowerGSInfo lgsInfo[] = {
        LowerGSInfo("__pseudo_gather_base_offsets_8",  "__gather_base_offsets_i8",  true),
        LowerGSInfo("__pseudo_gather_base_offsets_16", "__gather_base_offsets_i16", true),
        LowerGSInfo("__pseudo_gather_base_offsets_32", "__gather_base_offsets_i32", true),
        LowerGSInfo("__pseudo_gather_base_offsets_64", "__gather_base_offsets_i64", true),
        LowerGSInfo("__pseudo_scatter_base_offsets_8",  "__scatter_base_offsets_i8",  false),
        LowerGSInfo("__pseudo_scatter_base_offsets_16", "__scatter_base_offsets_i16", false),
        LowerGSInfo("__pseudo_scatter_base_offsets_32", "__scatter_base_offsets_i32", false),
        LowerGSInfo("__pseudo_scatter_base_offsets_64", "__scatter_base_offsets_i64", false)
    };

    bool modifiedAny = false;
 restart:
    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
        // Loop over the instructions and find calls to the
        // __pseudo_*_base_offsets_* functions.
        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
        if (callInst == NULL)
            continue;
        llvm::Function *calledFunc = callInst->getCalledFunction();
        LowerGSInfo *info = NULL;
        for (unsigned int i = 0; i < sizeof(lgsInfo) / sizeof(lgsInfo[0]); ++i) {
            if (lgsInfo[i].pseudoFunc != NULL &&
                calledFunc == lgsInfo[i].pseudoFunc) {
                info = &lgsInfo[i];
                break;
            }
        }
        if (info == NULL)
            continue;

        // Get the source position from the metadata attached to the call
        // instruction so that we can issue PerformanceWarning()s below.
        SourcePos pos;
        bool ok = lGetSourcePosFromMetadata(callInst, &pos);
        assert(ok);

        callInst->setCalledFunction(info->actualFunc);
        if (info->isGather)
            PerformanceWarning(pos, "Gather required to compute value in expression.");
        else
            PerformanceWarning(pos, "Scatter required for storing value.");
        modifiedAny = true;
        goto restart;
    }
    return modifiedAny;
}


static llvm::Pass *
CreateLowerGatherScatterPass() {
    return new LowerGSPass;
}


///////////////////////////////////////////////////////////////////////////
// IsCompileTimeConstantPass

/** LLVM IR implementations of target-specific functions may include calls
    to the functions "bool __is_compile_time_constant_*(...)"; these allow
    them to have specialized code paths for when the corresponding value is
    known at compile time.  For masks, for example, this allows them to
    avoid the cost of a MOVMSK call at runtime to compute the mask's value
    when that value is already known at compile time.

    This pass resolves these calls into either 'true' or 'false' values so
    that later optimization passes can operate with these as constants.

    See stdlib.m4 for a number of uses of this idiom.
*/
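// The idiom in the target library code looks roughly like the following
// (illustrative sketch, not a verbatim excerpt from stdlib.m4):
//
//     if (__is_compile_time_constant_mask(mask)) {
//         // specialized path: the mask value is known at compile time,
//         // so e.g. an "all lanes on" check costs nothing at runtime
//     } else {
//         // general path
//     }
//
// Once this pass has replaced the call with "true" or "false", later
// optimization (constant folding, dead-code elimination) keeps only the
// relevant branch.
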
class IsCompileTimeConstantPass : public llvm::BasicBlockPass {
public:
    static char ID;
    IsCompileTimeConstantPass(bool last = false) : BasicBlockPass(ID) {
        isLastTry = last;
    }

    const char *getPassName() const { return "Resolve \"is compile time constant\""; }
    bool runOnBasicBlock(llvm::BasicBlock &BB);

    bool isLastTry;
};

char IsCompileTimeConstantPass::ID = 0;

llvm::RegisterPass<IsCompileTimeConstantPass>
    ctcrp("compile-time-constant", "Compile-Time Constant Resolve Pass");

bool
IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
    llvm::Function *funcs[] = {
        m->module->getFunction("__is_compile_time_constant_mask"),
        m->module->getFunction("__is_compile_time_constant_uniform_int32"),
        m->module->getFunction("__is_compile_time_constant_varying_int32")
    };

    bool modifiedAny = false;
 restart:
    for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
        // Iterate through the instructions looking for calls to the
        // __is_compile_time_constant_*() functions
        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
        if (callInst == NULL)
            continue;

        int j;
        int nFuncs = sizeof(funcs) / sizeof(funcs[0]);
        for (j = 0; j < nFuncs; ++j) {
            if (funcs[j] != NULL && callInst->getCalledFunction() == funcs[j])
                break;
        }
        if (j == nFuncs)
            // not a __is_compile_time_constant_* function
            continue;

        // This optimization pass can be disabled with the (poorly named)
        // disableGatherScatterFlattening option.
        if (g->opt.disableGatherScatterFlattening) {
            llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
            modifiedAny = true;
            goto restart;
        }

        // Is it a constant?  Bingo, turn the call's value into a constant
        // true value.
        llvm::Value *operand = callInst->getArgOperand(0);
        if (llvm::isa<llvm::Constant>(operand)) {
            llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMTrue);
            modifiedAny = true;
            goto restart;
        }

        // This pass runs multiple times during optimization.  Up until the
        // very last time, it only replaces the call with a 'true' if the
        // value is known to be constant and otherwise leaves the call
        // alone, in case further optimization passes can help resolve its
        // value.  The last time through, it eventually has to give up, and
        // replaces any remaining ones with 'false' constants.
        if (isLastTry) {
            llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
            modifiedAny = true;
            goto restart;
        }
    }

    return modifiedAny;
}


static llvm::Pass *
CreateIsCompileTimeConstantPass(bool isLastTry) {
    return new IsCompileTimeConstantPass(isLastTry);
}


///////////////////////////////////////////////////////////////////////////
// MakeInternalFuncsStaticPass

/** There are a number of target-specific functions that we use during
    these optimization passes.  By the time we are done with optimization,
    any uses of these should be inlined and no calls to these functions
    should remain.  This pass marks all of these functions as having
    internal linkage so that subsequent passes can eliminate them as dead
    code, thus cleaning up the final code output by the compiler.  We can't
    just declare these as static from the start, however, since then they
    end up being eliminated as dead code during early optimization passes
    even though we may need to generate calls to them during later
    optimization passes.
*/
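// For example (illustrative): once __load_and_broadcast_32 has internal
// linkage and all calls to it have been rewritten or inlined away, a later
// global dead-code-elimination pass is free to drop its definition from the
// module, so it never shows up in the generated object file.
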
class MakeInternalFuncsStaticPass : public llvm::ModulePass {
public:
    static char ID;
    MakeInternalFuncsStaticPass(bool last = false) : ModulePass(ID) {
    }

    const char *getPassName() const { return "Make internal funcs \"static\""; }
    bool runOnModule(llvm::Module &m);
};

char MakeInternalFuncsStaticPass::ID = 0;

llvm::RegisterPass<MakeInternalFuncsStaticPass>
    mifsp("make-internal-funcs-static", "Make Internal Funcs Static Pass");


bool
MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
    const char *names[] = {
        "__fast_masked_vload",
        "__gather_base_offsets_i8", "__gather_base_offsets_i16",
        "__gather_base_offsets_i32", "__gather_base_offsets_i64",
        "__gather_elt_i8", "__gather_elt_i16",
        "__gather_elt_i32", "__gather_elt_i64",
        "__load_and_broadcast_8", "__load_and_broadcast_16",
        "__load_and_broadcast_32", "__load_and_broadcast_64",
        "__load_masked_8", "__load_masked_16",
        "__load_masked_32", "__load_masked_64",
        "__masked_store_8", "__masked_store_16",
        "__masked_store_32", "__masked_store_64",
        "__masked_store_blend_8", "__masked_store_blend_16",
        "__masked_store_blend_32", "__masked_store_blend_64",
        "__scatter_base_offsets_i8", "__scatter_base_offsets_i16",
        "__scatter_base_offsets_i32", "__scatter_base_offsets_i64",
        "__scatter_elt_i8", "__scatter_elt_i16",
        "__scatter_elt_i32", "__scatter_elt_i64",
    };

    bool modifiedAny = false;
    int count = sizeof(names) / sizeof(names[0]);
    for (int i = 0; i < count; ++i) {
        llvm::Function *f = m->module->getFunction(names[i]);
        if (f != NULL) {
            f->setLinkage(llvm::GlobalValue::InternalLinkage);
            modifiedAny = true;
        }
    }

    return modifiedAny;
}


static llvm::Pass *
CreateMakeInternalFuncsStaticPass() {
    return new MakeInternalFuncsStaticPass;
}