diff --git a/opt.cpp b/opt.cpp index 53def6f2..1ebebd6c 100644 --- a/opt.cpp +++ b/opt.cpp @@ -44,6 +44,8 @@ #include "llvmutil.h" #include +#include +#include #include #include @@ -1558,7 +1560,7 @@ static void lPrintVector(const char *info, llvm::Value *elements[ISPC_MAX_NVEC]) */ static bool lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector, - int vectorLength) { + int vectorLength, std::map &phiMap) { // First initialize the values of scalarizedVector[] to NULL. for (int i = 0; i < vectorLength; ++i) scalarizedVector[i] = NULL; @@ -1599,8 +1601,8 @@ lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector, llvm::Value **v1 = (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *)); - if (!lScalarizeVector(bo->getOperand(0), v0, vectorLength) || - !lScalarizeVector(bo->getOperand(1), v1, vectorLength)) + if (!lScalarizeVector(bo->getOperand(0), v0, vectorLength, phiMap) || + !lScalarizeVector(bo->getOperand(1), v1, vectorLength, phiMap)) return false; for (int i = 0; i < vectorLength; ++i) { @@ -1652,7 +1654,7 @@ lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector, llvm::Value **scalarizedTarget = (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *)); if (!lScalarizeVector(ci->getOperand(0), scalarizedTarget, - vectorLength)) + vectorLength, phiMap)) return false; LLVM_TYPE_CONST llvm::Type *destType = ci->getDestTy(); @@ -1694,8 +1696,8 @@ lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector, llvm::Value **v0 = (llvm::Value **)alloca(n0 * sizeof(llvm::Value *)); llvm::Value **v1 = (llvm::Value **)alloca(n1 * sizeof(llvm::Value *)); - if (!lScalarizeVector(svi->getOperand(0), v0, n0) || - !lScalarizeVector(svi->getOperand(1), v1, n1)) + if (!lScalarizeVector(svi->getOperand(0), v0, n0, phiMap) || + !lScalarizeVector(svi->getOperand(1), v1, n1, phiMap)) return false; llvm::ConstantAggregateZero *caz = @@ -1777,6 +1779,55 @@ lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector, return true; } + llvm::PHINode *phi = llvm::dyn_cast(vec); + if (phi != NULL) { + // If we've seen this phi node during an earlier recursive step, + // then don't re-scalarize it, but return the already allocated + // pointer values. (This both avoids an infinite loop and ensures + // that we point back to ourself properly for cases where some of + // the incoming values end up being derived from the phi node... + if (phiMap.find(phi) != phiMap.end()) { + llvm::Value **v = phiMap[phi]; + for (int i = 0; i < vectorLength; ++i) + scalarizedVector[i] = v[i]; + return true; + } + + LLVM_TYPE_CONST llvm::VectorType *vecType = + llvm::dyn_cast(phi->getType()); + LLVM_TYPE_CONST llvm::Type *eltType = vecType->getElementType(); + assert(vecType != NULL); + unsigned int numIncoming = phi->getNumIncomingValues(); + + // First allocate all of the scalarized phi nodes, so that we can + // get them into the map<> before making recursive calls to + // lScalarizeVector. + for (int i = 0; i < vectorLength; ++i) { + scalarizedVector[i] = + llvm::PHINode::Create(eltType, numIncoming, "phi", phi); + lCopyMetadata(scalarizedVector[i], phi); + } + + phiMap[phi] = scalarizedVector; + + llvm::Value **vin = (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *)); + // Now, for each incoming value, scalarize it recursively and then + // update the scalarized phi node for this element. + for (unsigned int i = 0; i < numIncoming; ++i) { + if (!lScalarizeVector(phi->getIncomingValue(i), vin, vectorLength, phiMap)) + return false; + llvm::BasicBlock *bbin = phi->getIncomingBlock(i); + + for (int j = 0; j < vectorLength; ++j) { + llvm::PHINode *phi = (llvm::PHINode *)scalarizedVector[j]; + phi->addIncoming(vin[j], bbin); + } + } + + phiMap.erase(phiMap.find(phi)); + return true; + } + #if 0 fprintf(stderr, "flatten vector fixme\n"); vec->dump(); @@ -1798,7 +1849,9 @@ lScalarizeVector(llvm::Value *vec, llvm::Value **scalarizedVector, won't get all of them. */ static bool -lValuesAreEqual(llvm::Value *v0, llvm::Value *v1) { +lValuesAreEqual(llvm::Value *v0, llvm::Value *v1, + std::vector &seenPhi0, + std::vector &seenPhi1) { // Thanks to the fact that LLVM hashes and returns the same pointer for // constants (of all sorts, even constant expressions), this first test // actually catches a lot of cases. LLVM's SSA form also helps a lot @@ -1806,13 +1859,49 @@ lValuesAreEqual(llvm::Value *v0, llvm::Value *v1) { if (v0 == v1) return true; + assert(seenPhi0.size() == seenPhi1.size()); + for (unsigned int i = 0; i < seenPhi0.size(); ++i) + if (v0 == seenPhi0[i] && v1 == seenPhi1[i]) + return true; + llvm::BinaryOperator *bo0 = llvm::dyn_cast(v0); llvm::BinaryOperator *bo1 = llvm::dyn_cast(v1); - if (bo0 && bo1) { + if (bo0 != NULL && bo1 != NULL) { if (bo0->getOpcode() != bo1->getOpcode()) return false; - return (lValuesAreEqual(bo0->getOperand(0), bo1->getOperand(0)) && - lValuesAreEqual(bo0->getOperand(1), bo1->getOperand(1))); + return (lValuesAreEqual(bo0->getOperand(0), bo1->getOperand(0), + seenPhi0, seenPhi1) && + lValuesAreEqual(bo0->getOperand(1), bo1->getOperand(1), + seenPhi0, seenPhi1)); + } + + llvm::PHINode *phi0 = llvm::dyn_cast(v0); + llvm::PHINode *phi1 = llvm::dyn_cast(v1); + if (phi0 != NULL && phi1 != NULL) { + if (phi0->getNumIncomingValues() != + phi1->getNumIncomingValues()) + return false; + + seenPhi0.push_back(phi0); + seenPhi1.push_back(phi1); + + unsigned int numIncoming = phi0->getNumIncomingValues(); + // Check all of the incoming values: if all of them are all equal, + // then we're good. + bool anyFailure = false; + for (unsigned int i = 0; i < numIncoming; ++i) { + assert(phi0->getIncomingBlock(i) == phi1->getIncomingBlock(i)); + if (!lValuesAreEqual(phi0->getIncomingValue(i), + phi1->getIncomingValue(i), seenPhi0, seenPhi1)) { + anyFailure = true; + break; + } + } + + seenPhi0.pop_back(); + seenPhi1.pop_back(); + + return !anyFailure; } return false; @@ -1824,9 +1913,10 @@ lValuesAreEqual(llvm::Value *v0, llvm::Value *v1) { arrays where the values are actually all equal. */ static bool -lVectorValuesAllEqual(llvm::Value *v[ISPC_MAX_NVEC]) { - for (int i = 0; i < g->target.vectorWidth-1; ++i) - if (!lValuesAreEqual(v[i], v[i+1])) +lVectorValuesAllEqual(llvm::Value **v, int vectorLength) { + std::vector seenPhi0, seenPhi1; + for (int i = 0; i < vectorLength-1; ++i) + if (!lValuesAreEqual(v[i], v[i+1], seenPhi0, seenPhi1)) return false; return true; } @@ -1838,7 +1928,7 @@ lVectorValuesAllEqual(llvm::Value *v[ISPC_MAX_NVEC]) { elements. */ static bool -lVectorIsLinearConstantInts(llvm::Value *v[ISPC_MAX_NVEC], int stride) { +lVectorIsLinearConstantInts(llvm::Value **v, int vectorLength, int stride) { llvm::ConstantInt *prev = llvm::dyn_cast(v[0]); if (!prev) return false; @@ -1847,7 +1937,7 @@ lVectorIsLinearConstantInts(llvm::Value *v[ISPC_MAX_NVEC], int stride) { // For each element in the array, see if it is both a ConstantInt and // if the difference between it and the value of the previous element // is stride. If not, fail. - for (int i = 1; i < g->target.vectorWidth; ++i) { + for (int i = 1; i < vectorLength; ++i) { llvm::ConstantInt *next = llvm::dyn_cast(v[i]); if (!next) return false; @@ -1874,14 +1964,15 @@ lVectorIsLinearConstantInts(llvm::Value *v[ISPC_MAX_NVEC], int stride) { back an array of all zeros? */ static bool -lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) { +lVectorIsLinear(llvm::Value **v, int vectorLength, int stride, + std::set &seenPhis) { #if 0 lPrintVector("called lVectorIsLinear", v); #endif // First try the easy case: if the values are all just constant // integers and have the expected stride between them, then we're done. - if (lVectorIsLinearConstantInts(v, stride)) + if (lVectorIsLinearConstantInts(v, vectorLength, stride)) return true; // ConstantExprs need a bit of deconstruction to figure out @@ -1893,7 +1984,7 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) { // some sort. If not, give up. // FIXME: are we potentially missing cases here, e.g. a mixture of // ConstantExprs and ConstantInts? - for (int i = 0; i < g->target.vectorWidth; ++i) { + for (int i = 0; i < vectorLength; ++i) { if (!llvm::isa(v[i])) return false; } @@ -1904,7 +1995,7 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) { // "foo", so we need to deal with cases where element 0 is "foo", // element 1 is add(4, foo), etc... bool anyAdds = false, allAdds = true; - for (int i = 0; i < g->target.vectorWidth; ++i) { + for (int i = 0; i < vectorLength; ++i) { llvm::ConstantExpr *ce = llvm::dyn_cast(v[i]); if (ce->getOpcode() == llvm::Instruction::Add || ce->getOpcode() == llvm::Instruction::Sub) @@ -1924,8 +2015,9 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) { // are an add with 0 as the other operand. // 2. Extract the ConstInt operand of the add into the intBit[] // array and put the other operand in the otherBit[] array. - llvm::Value *intBit[ISPC_MAX_NVEC], *otherBit[ISPC_MAX_NVEC]; - for (int i = 0; i < g->target.vectorWidth; ++i) { + llvm::Value **intBit = (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *)); + llvm::Value **otherBit = (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *)); + for (int i = 0; i < vectorLength; ++i) { llvm::ConstantExpr *ce = llvm::dyn_cast(v[i]); if (ce->getOpcode() == llvm::Instruction::Add) { // The ConstantInt may be either of the two operands of @@ -1962,8 +2054,8 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) { // we're adding constant values with the desired stride to the // same base value. If so, we know we have a linear set of // locations. - return (lVectorIsLinear(intBit, stride) && - lVectorValuesAllEqual(otherBit)); + return (lVectorIsLinear(intBit, vectorLength, stride, seenPhis) && + lVectorValuesAllEqual(otherBit, vectorLength)); } // If this ever hits, the assertion can just be commented out and @@ -1972,7 +2064,7 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) { // up and possibly hurting performance of the final code. FATAL("Unexpected case with a ConstantExpr in lVectorIsLinear"); #if 0 - for (int i = 0; i < g->target.vectorWidth; ++i) + for (int i = 0; i < vectorLength; ++i) v[i]->dump(); FATAL("FIXME"); #endif @@ -1988,7 +2080,7 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) { // and then everything after element 0 being a binary operator with an add. // That won't get caught by this case?? bool anyAdd = false, anySub = false; - for (int i = 0; i < g->target.vectorWidth; ++i) { + for (int i = 0; i < vectorLength; ++i) { llvm::BinaryOperator *bopi = llvm::dyn_cast(v[i]); if (bopi) { if (bopi->getOpcode() == llvm::Instruction::Add) @@ -2010,11 +2102,13 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) { // two operands matches the one we started with? That would be // more robust to switching the ordering of operands, in case // that ever happens... + llvm::Value **addSubOperandValues = + (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *)); + for (int operand = 0; operand <= 1; ++operand) { - llvm::Value *addSubOperandValues[ISPC_MAX_NVEC]; // Go through the vector elements and grab the operand'th // one if this is an add or the v - for (int i = 0; i < g->target.vectorWidth; ++i) { + for (int i = 0; i < vectorLength; ++i) { llvm::BinaryOperator *bop = llvm::dyn_cast(v[i]); if (bop->getOpcode() == llvm::Instruction::Add || bop->getOpcode() == llvm::Instruction::Sub) @@ -2026,7 +2120,7 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) { addSubOperandValues[i] = v[i]; } - if (lVectorValuesAllEqual(addSubOperandValues) && + if (lVectorValuesAllEqual(addSubOperandValues, vectorLength) && (anyAdd || operand == 1)) { // If this operand's values are all equal, then the // overall result is an ascending linear sequence if @@ -2043,7 +2137,7 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) { // future point we could generate code for that as a // vector load + shuffle. int otherOperand = operand ^ 1; - for (int i = 0; i < g->target.vectorWidth; ++i) { + for (int i = 0; i < vectorLength; ++i) { llvm::BinaryOperator *bop = llvm::dyn_cast(v[i]); if (bop->getOpcode() == llvm::Instruction::Add || bop->getOpcode() == llvm::Instruction::Sub) @@ -2051,7 +2145,7 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) { else addSubOperandValues[i] = LLVMInt32(0); } - return lVectorIsLinear(addSubOperandValues, stride); + return lVectorIsLinear(addSubOperandValues, vectorLength, stride, seenPhis); } } } @@ -2094,8 +2188,8 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) { if ((stride % mulScale) != 0) return false; - llvm::Value *otherValue[ISPC_MAX_NVEC]; - for (int i = 0; i < g->target.vectorWidth; ++i) { + llvm::Value **otherValue = (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *)); + for (int i = 0; i < vectorLength; ++i) { llvm::BinaryOperator *eltBop = llvm::dyn_cast(v[i]); // Give up if it's not matching the desired pattern of "all // mul ops with the scaleOperand being a constant with the @@ -2109,10 +2203,64 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) { } // Now see if the sequence of values being scaled gives us // something with the desired stride. - return lVectorIsLinear(otherValue, stride / mulScale); + return lVectorIsLinear(otherValue, vectorLength, stride / mulScale, seenPhis); } } + if (llvm::dyn_cast(v[0]) != NULL) { + int found = 0; + // If all of them have made it back to a phi node we've seen + // before, then we're good. + for (int i = 0; i < vectorLength; ++i) { + llvm::PHINode *phi = llvm::dyn_cast(v[i]); + assert(phi != NULL); + if (seenPhis.find(phi) != seenPhis.end()) + ++found; + } + assert(found == 0 || found == vectorLength); + if (found == vectorLength) + return true; + + // Otherwise record that we've seen these guys before. + for (int i = 0; i < vectorLength; ++i) { + llvm::PHINode *phi = llvm::dyn_cast(v[i]); + seenPhis.insert(phi); + } + + llvm::Value **vin = (llvm::Value **)alloca(vectorLength * sizeof(llvm::Value *)); + llvm::PHINode *phi = llvm::dyn_cast(v[0]); + unsigned int numIncoming = phi->getNumIncomingValues(); + llvm::BasicBlock *bb = NULL; + bool anyFailure = false; + // Check each incoming value; if all of them are linear, then success. + for (unsigned int i = 0; i < numIncoming; ++i) { + for (int j = 0; j < vectorLength; ++j) { + llvm::PHINode *phi = llvm::dyn_cast(v[j]); + assert(phi != NULL); + vin[j] = phi->getIncomingValue(i); + + if (j == 0) + bb = phi->getIncomingBlock(i); + else + assert(bb == phi->getIncomingBlock(i)); + } + + if (!lVectorIsLinear(vin, vectorLength, stride, seenPhis)) { + // Don't return false immediately, since we need to remove + // the PHINode *s from v[] from seenPhis before we return. + anyFailure = true; + break; + } + } + + for (int i = 0; i < vectorLength; ++i) { + llvm::PHINode *phi = llvm::dyn_cast(v[i]); + seenPhis.erase(phi); + } + + return !anyFailure; + } + return false; } @@ -2229,12 +2377,13 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) { int vecLength = vt->getNumElements(); assert(vecLength == g->target.vectorWidth); llvm::Value *offsetElements[ISPC_MAX_NVEC]; - if (!lScalarizeVector(vecValue, offsetElements, vecLength)) + std::map phiMap; + if (!lScalarizeVector(vecValue, offsetElements, vecLength, phiMap)) continue; llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3); - if (lVectorValuesAllEqual(offsetElements)) { + if (lVectorValuesAllEqual(offsetElements, g->target.vectorWidth)) { // If all the offsets are equal, then compute the single // pointer they all represent based on the first one of them // (arbitrarily). @@ -2302,7 +2451,8 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) { } int step = gatherInfo ? gatherInfo->align : scatterInfo->align; - if (lVectorIsLinear(offsetElements, step)) { + std::set seenPhis; + if (lVectorIsLinear(offsetElements, g->target.vectorWidth, step, seenPhis)) { // We have a linear sequence of memory locations being accessed // starting with the location given by the offset from // offsetElements[0], with stride of 4 or 8 bytes (for 32 bit diff --git a/tests/phi-opts-1.ispc b/tests/phi-opts-1.ispc new file mode 100644 index 00000000..d4265681 --- /dev/null +++ b/tests/phi-opts-1.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float sum = 0; + for (int i = 0; i < 16; i += programCount) + sum += aFOO[i+programIndex]; + RET[programIndex] = reduce_add(sum); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 136; +} diff --git a/tests/phi-opts-2.ispc b/tests/phi-opts-2.ispc new file mode 100644 index 00000000..1ee8dcb7 --- /dev/null +++ b/tests/phi-opts-2.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float sum = 0; + for (int i = programIndex; i < 16; i += programCount) + sum += aFOO[i]; + RET[programIndex] = reduce_add(sum); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 136; +} diff --git a/tests/phi-opts-3.ispc b/tests/phi-opts-3.ispc new file mode 100644 index 00000000..fe3f7e5e --- /dev/null +++ b/tests/phi-opts-3.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float sum = 0; + for (int i = 0; i < 16; ++i) + sum += aFOO[i]; + RET[programIndex] = extract(sum, 0); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 136; +} diff --git a/tests/phi-opts-4.ispc b/tests/phi-opts-4.ispc new file mode 100644 index 00000000..5ef7f1b4 --- /dev/null +++ b/tests/phi-opts-4.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float sum = 0; + // Obscfucated way to make delta all one, but make that unclear to the + // compiler + int delta = aFOO[min(0., aFOO[programIndex])]; + + // The optimization shouldn't apply for this, since delta isn't known + // to be all equal + for (int i = 0; i < 16; i += delta) + sum += aFOO[i]; + RET[programIndex] = extract(sum, 0); +} + +export void result(uniform float RET[]) { + RET[programIndex] = 136; +}