Fix performance regression introduced in be0c77d556

Effectively, the patterns that detected when the offsets of a gather
or scatter in base+offsets form were actually a multiple of 2/4/8
were no longer working.

This change not only fixes that regression, but also expands the set
of patterns that are matched.  For example, given offsets of
the form 4*v1 + 16*v2, it identifies a scale of 4 and new offsets
of v1 + 4*v2.

This fix makes the volume renderer run 1.19x faster, and noise 1.54x
faster.
This commit is contained in:
Matt Pharr
2012-01-19 17:54:21 -08:00
parent 2fb59c90cf
commit 4388338dad

46
opt.cpp
View File

@@ -1174,21 +1174,47 @@ lExtractOffsetVector248Scale(llvm::Value **vec) {
return scale;
}
// If we don't have a multiply, then just return
// If we don't have a binary operator, then just give up
llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
if (bop == NULL || bop->getOpcode() != llvm::Instruction::Mul)
if (bop == NULL)
return LLVMInt32(1);
// Check each operand for being one of the scale factors we care about.
llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
int splat;
if (lIs248Splat(op0, &splat)) {
*vec = op1;
return LLVMInt32(splat);
if (bop->getOpcode() == llvm::Instruction::Add) {
if (llvm::isa<llvm::ConstantAggregateZero>(op0)) {
*vec = op1;
return lExtractOffsetVector248Scale(vec);
}
else if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
*vec = op0;
return lExtractOffsetVector248Scale(vec);
}
else {
llvm::Value *s0 = lExtractOffsetVector248Scale(&op0);
llvm::Value *s1 = lExtractOffsetVector248Scale(&op1);
if (s0 == s1) {
*vec = llvm::BinaryOperator::Create(llvm::Instruction::Add,
op0, op1, "new_add", bop);
return s0;
}
else
return LLVMInt32(1);
}
}
else if (lIs248Splat(op1, &splat)) {
*vec = op0;
return LLVMInt32(splat);
else if (bop->getOpcode() == llvm::Instruction::Mul) {
// Check each operand for being one of the scale factors we care about.
int splat;
if (lIs248Splat(op0, &splat)) {
*vec = op1;
return LLVMInt32(splat);
}
else if (lIs248Splat(op1, &splat)) {
*vec = op0;
return LLVMInt32(splat);
}
else
return LLVMInt32(1);
}
else
return LLVMInt32(1);