From 48644813d4d3b34a1da8360844f1ac67bff0c964 Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Thu, 14 Nov 2013 11:30:22 +0100
Subject: [PATCH] stmt.cpp forking on foreach

---
 stmt.cpp | 1015 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 761 insertions(+), 254 deletions(-)

diff --git a/stmt.cpp b/stmt.cpp
index 4ec63d35..05209e14 100644
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -1273,7 +1273,10 @@ static llvm::Value *
 lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
                       llvm::Value *uniformCounterPtr,
                       llvm::Value *varyingCounterPtr,
-                      const std::vector<int> &spans) {
+                      const std::vector<int> &spans)
+{
+    if (!g->target->isPTX())
+    {
     // Smear the uniform counter value out to be varying
     llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
     llvm::Value *smearCounter = ctx->BroadcastValue(
@@ -1306,6 +1309,47 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
                             LLVMInt32Vector(delta), "iter_val");
     ctx->StoreInst(varyingCounter, varyingCounterPtr);
     return varyingCounter;
+    }
+    else /* isPTX() == true */
+    {
+        // Smear the uniform counter value out to be varying
+        llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
+        llvm::Value *smearCounter = ctx->BroadcastValue(
+            counter, LLVMTypes::Int32VectorType, "smear_counter");
+
+        // Figure out the offsets; this is a little bit tricky.  As an example,
+        // consider a 2D tiled foreach loop, where we're running 8-wide and
+        // where the inner dimension has a stride of 4 and the outer dimension
+        // has a stride of 2.  For the inner dimension, we want the offsets
+        // (0,1,2,3,0,1,2,3), and for the outer dimension we want
+        // (0,0,0,0,1,1,1,1).
+        int32_t delta[ISPC_MAX_NVEC];
+        const int vecWidth = 32;
+        Assert(vecWidth <= ISPC_MAX_NVEC);
+        for (int i = 0; i < vecWidth; ++i) {
+            int d = i;
+            // First, account for the effect of any dimensions at deeper
+            // nesting levels than the current one.
+            int prevDimSpanCount = 1;
+            for (int j = dim; j < nDims-1; ++j)
+                prevDimSpanCount *= spans[j+1];
+            d /= prevDimSpanCount;
+
+            // And now with what's left, figure out our own offset
+            delta[i] = d % spans[dim];
+        }
+
+        // 32-wide types for the PTX/SIMT path; not yet used in this function.
+        llvm::VectorType *int32VectorSIMT =
+            llvm::VectorType::get(LLVMTypes::Int32Type, 32);
+        llvm::ArrayType *arrayDelta =
+            llvm::ArrayType::get(LLVMTypes::Int32Type, 32);
+
+        // Add the deltas to compute the varying counter values; store the
+        // result to memory and then return it directly as well.
+ llvm::Value *varyingCounter = + ctx->BinaryOperator(llvm::Instruction::Add, smearCounter, + LLVMInt32Vector(delta), "iter_val"); + ctx->StoreInst(varyingCounter, varyingCounterPtr); + return varyingCounter; + } } @@ -1368,65 +1412,67 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { if (ctx->GetCurrentBasicBlock() == NULL || stmts == NULL) return; - llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body"); - llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body"); - llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit"); + if (!g->target->isPTX()) + { + llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body"); + llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body"); + llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit"); - llvm::Value *oldMask = ctx->GetInternalMask(); - llvm::Value *oldFunctionMask = ctx->GetFunctionMask(); + llvm::Value *oldMask = ctx->GetInternalMask(); + llvm::Value *oldFunctionMask = ctx->GetFunctionMask(); - ctx->SetDebugPos(pos); - ctx->StartScope(); + ctx->SetDebugPos(pos); + ctx->StartScope(); - ctx->SetInternalMask(LLVMMaskAllOn); - ctx->SetFunctionMask(LLVMMaskAllOn); + ctx->SetInternalMask(LLVMMaskAllOn); + ctx->SetFunctionMask(LLVMMaskAllOn); - // This should be caught during typechecking - AssertPos(pos, startExprs.size() == dimVariables.size() && - endExprs.size() == dimVariables.size()); - int nDims = (int)dimVariables.size(); + // This should be caught during typechecking + AssertPos(pos, startExprs.size() == dimVariables.size() && + endExprs.size() == dimVariables.size()); + int nDims = (int)dimVariables.size(); - /////////////////////////////////////////////////////////////////////// - // Setup: compute the number of items we have to work on in each - // dimension and a number of derived values. - std::vector bbReset, bbStep, bbTest; - std::vector startVals, endVals, uniformCounterPtrs; - std::vector nExtras, alignedEnd, extrasMaskPtrs; + /////////////////////////////////////////////////////////////////////// + // Setup: compute the number of items we have to work on in each + // dimension and a number of derived values. + std::vector bbReset, bbStep, bbTest; + std::vector startVals, endVals, uniformCounterPtrs; + std::vector nExtras, alignedEnd, extrasMaskPtrs; - std::vector span(nDims, 0); - lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]); + std::vector span(nDims, 0); + lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]); - for (int i = 0; i < nDims; ++i) { + for (int i = 0; i < nDims; ++i) { // Basic blocks that we'll fill in later with the looping logic for // this dimension. 
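// ---------------------------------------------------------------------------
// Illustration (not part of the patch): the per-lane offset math that
// lUpdateVaryingCounter above implements, as a standalone program.  The
// helper name printDeltas and the 2 x 4 span values are illustrative only;
// for an 8-wide 2D tiled loop it prints 0 0 0 0 1 1 1 1 for the outer
// dimension and 0 1 2 3 0 1 2 3 for the inner one, matching the comment in
// the hunk above.
#include <cstdio>
#include <vector>

static void printDeltas(int dim, int nDims, int vecWidth,
                        const std::vector<int> &spans) {
    for (int i = 0; i < vecWidth; ++i) {
        int d = i;
        // Divide out the spans of all dimensions nested more deeply than 'dim'.
        int prevDimSpanCount = 1;
        for (int j = dim; j < nDims - 1; ++j)
            prevDimSpanCount *= spans[j + 1];
        d /= prevDimSpanCount;
        // What remains cycles through this dimension's own span.
        printf("%d ", d % spans[dim]);
    }
    printf("\n");
}

int main() {
    std::vector<int> spans;
    spans.push_back(2);            // outer dimension span (illustrative)
    spans.push_back(4);            // inner dimension span (illustrative)
    printDeltas(0, 2, 8, spans);   // outer: 0 0 0 0 1 1 1 1
    printDeltas(1, 2, 8, spans);   // inner: 0 1 2 3 0 1 2 3
    return 0;
}
// ---------------------------------------------------------------------------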
bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); if (i < nDims-1) - // stepping for the innermost dimension is handled specially - bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); + // stepping for the innermost dimension is handled specially + bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); // Start and end value for this loop dimension llvm::Value *sv = startExprs[i]->GetValue(ctx); llvm::Value *ev = endExprs[i]->GetValue(ctx); if (sv == NULL || ev == NULL) - return; + return; startVals.push_back(sv); endVals.push_back(ev); // nItems = endVal - startVal llvm::Value *nItems = - ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); + ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); // nExtras = nItems % (span for this dimension) // This gives us the number of extra elements we need to deal with // at the end of the loop for this dimension that don't fit cleanly // into a vector width. nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, - LLVMInt32(span[i]), "nextras")); + LLVMInt32(span[i]), "nextras")); // alignedEnd = endVal - nExtras alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, - nExtras[i], "aligned_end")); + nExtras[i], "aligned_end")); /////////////////////////////////////////////////////////////////////// // Each dimension has a loop counter that is a uniform value that @@ -1434,15 +1480,15 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // dimension. Its value is only used internally here for looping // logic and isn't directly available in the user's program code. uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, - "counter")); + "counter")); ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); // There is also a varying variable that holds the set of index // values for each dimension in the current loop iteration; this is // the value that is program-visible. dimVariables[i]->storagePtr = - ctx->AllocaInst(LLVMTypes::Int32VectorType, - dimVariables[i]->name.c_str()); + ctx->AllocaInst(LLVMTypes::Int32VectorType, + dimVariables[i]->name.c_str()); dimVariables[i]->parentFunction = ctx->GetFunction(); ctx->EmitVariableDebugInfo(dimVariables[i]); @@ -1452,71 +1498,71 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // out-of-bounds offsets.) extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); - } + } - ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR); + ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR); - // On to the outermost loop's test - ctx->BranchInst(bbTest[0]); + // On to the outermost loop's test + ctx->BranchInst(bbTest[0]); - /////////////////////////////////////////////////////////////////////////// - // foreach_reset: this code runs when we need to reset the counter for - // a given dimension in preparation for running through its loop again, - // after the enclosing level advances its counter. - for (int i = 0; i < nDims; ++i) { + /////////////////////////////////////////////////////////////////////////// + // foreach_reset: this code runs when we need to reset the counter for + // a given dimension in preparation for running through its loop again, + // after the enclosing level advances its counter. 
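// ---------------------------------------------------------------------------
// Illustration (not part of the patch): the per-dimension bookkeeping set up
// above, done as plain host arithmetic rather than emitted IR, with made-up
// start/end/span values.  For start = 0, end = 19 and an 8-wide span,
// nExtras = 3 and alignedEnd = 16, so this dimension runs two full-vector
// iterations and one final masked iteration over [16, 19).
#include <cstdio>

int main() {
    const int start = 0, end = 19, span = 8;   // illustrative values
    const int nItems = end - start;
    const int nExtras = nItems % span;         // elements that don't fill a vector
    const int alignedEnd = end - nExtras;      // first counter without a full vector
    printf("nItems=%d nExtras=%d alignedEnd=%d\n", nItems, nExtras, alignedEnd);
    for (int counter = start; counter < alignedEnd; counter += span)
        printf("full vector over [%d, %d)\n", counter, counter + span);
    if (nExtras != 0)
        printf("masked vector over [%d, %d)\n", alignedEnd, end);
    return 0;
}
// ---------------------------------------------------------------------------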
+ for (int i = 0; i < nDims; ++i) { ctx->SetCurrentBasicBlock(bbReset[i]); if (i == 0) - ctx->BranchInst(bbExit); + ctx->BranchInst(bbExit); else { - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - ctx->BranchInst(bbStep[i-1]); + ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); + ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); + ctx->BranchInst(bbStep[i-1]); } - } + } - /////////////////////////////////////////////////////////////////////////// - // foreach_step: increment the uniform counter by the vector width. - // Note that we don't increment the varying counter here as well but - // just generate its value when we need it in the loop body. Don't do - // this for the innermost dimension, which has a more complex stepping - // structure.. - for (int i = 0; i < nDims-1; ++i) { + /////////////////////////////////////////////////////////////////////////// + // foreach_step: increment the uniform counter by the vector width. + // Note that we don't increment the varying counter here as well but + // just generate its value when we need it in the loop body. Don't do + // this for the innermost dimension, which has a more complex stepping + // structure.. + for (int i = 0; i < nDims-1; ++i) { ctx->SetCurrentBasicBlock(bbStep[i]); llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[i]), "new_counter"); + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[i]), "new_counter"); ctx->StoreInst(newCounter, uniformCounterPtrs[i]); ctx->BranchInst(bbTest[i]); - } + } - /////////////////////////////////////////////////////////////////////////// - // foreach_test (for all dimensions other than the innermost...) - std::vector inExtras; - for (int i = 0; i < nDims-1; ++i) { + /////////////////////////////////////////////////////////////////////////// + // foreach_test (for all dimensions other than the innermost...) 
+ std::vector inExtras; + for (int i = 0; i < nDims-1; ++i) { ctx->SetCurrentBasicBlock(bbTest[i]); llvm::Value *haveExtras = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, - endVals[i], alignedEnd[i], "have_extras"); + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, + endVals[i], alignedEnd[i], "have_extras"); llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); llvm::Value *atAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, - counter, alignedEnd[i], "at_aligned_end"); + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + counter, alignedEnd[i], "at_aligned_end"); llvm::Value *inEx = - ctx->BinaryOperator(llvm::Instruction::And, haveExtras, - atAlignedEnd, "in_extras"); + ctx->BinaryOperator(llvm::Instruction::And, haveExtras, + atAlignedEnd, "in_extras"); if (i == 0) - inExtras.push_back(inEx); + inExtras.push_back(inEx); else - inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, - inExtras[i-1], "in_extras_all")); + inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, + inExtras[i-1], "in_extras_all")); llvm::Value *varyingCounter = - lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], - dimVariables[i]->storagePtr, span); + lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], + dimVariables[i]->storagePtr, span); llvm::Value *smearEnd = ctx->BroadcastValue( endVals[i], LLVMTypes::Int32VectorType, "smear_end"); @@ -1524,265 +1570,265 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // Do a vector compare of its value to the end value to generate a // mask for this last bit of work. llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); emask = ctx->I1VecToBoolVec(emask); if (i == 0) - ctx->StoreInst(emask, extrasMaskPtrs[i]); + ctx->StoreInst(emask, extrasMaskPtrs[i]); else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->StoreInst(newMask, extrasMaskPtrs[i]); + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); + llvm::Value *newMask = + ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->StoreInst(newMask, extrasMaskPtrs[i]); } llvm::Value *notAtEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[i]); + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[i]); ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); - } + } - /////////////////////////////////////////////////////////////////////////// - // foreach_test (for innermost dimension) - // - // All of the outer dimensions are handled generically--basically as a - // for() loop from the start value to the end value, where at each loop - // test, we compute the mask of active elements for the current - // dimension and then update an overall mask that is the AND - // combination of all of the outer ones. - // - // The innermost loop is handled specially, for performance purposes. - // When starting the innermost dimension, we start by checking once - // whether any of the outer dimensions has set the mask to be - // partially-active or not. We follow different code paths for these - // two cases, taking advantage of the knowledge that the mask is all - // on, when this is the case. 
- // - // In each of these code paths, we start with a loop from the starting - // value to the aligned end value for the innermost dimension; we can - // guarantee that the innermost loop will have an "all on" mask (as far - // as its dimension is concerned) for the duration of this loop. Doing - // so allows us to emit code that assumes the mask is all on (for the - // case where none of the outer dimensions has set the mask to be - // partially on), or allows us to emit code that just uses the mask - // from the outer dimensions directly (for the case where they have). - // - // After this loop, we just need to deal with one vector's worth of - // "ragged extra bits", where the mask used includes the effect of the - // mask for the innermost dimension. - // - // We start out this process by emitting the check that determines - // whether any of the enclosing dimensions is partially active - // (i.e. processing extra elements that don't exactly fit into a - // vector). - llvm::BasicBlock *bbOuterInExtras = + /////////////////////////////////////////////////////////////////////////// + // foreach_test (for innermost dimension) + // + // All of the outer dimensions are handled generically--basically as a + // for() loop from the start value to the end value, where at each loop + // test, we compute the mask of active elements for the current + // dimension and then update an overall mask that is the AND + // combination of all of the outer ones. + // + // The innermost loop is handled specially, for performance purposes. + // When starting the innermost dimension, we start by checking once + // whether any of the outer dimensions has set the mask to be + // partially-active or not. We follow different code paths for these + // two cases, taking advantage of the knowledge that the mask is all + // on, when this is the case. + // + // In each of these code paths, we start with a loop from the starting + // value to the aligned end value for the innermost dimension; we can + // guarantee that the innermost loop will have an "all on" mask (as far + // as its dimension is concerned) for the duration of this loop. Doing + // so allows us to emit code that assumes the mask is all on (for the + // case where none of the outer dimensions has set the mask to be + // partially on), or allows us to emit code that just uses the mask + // from the outer dimensions directly (for the case where they have). + // + // After this loop, we just need to deal with one vector's worth of + // "ragged extra bits", where the mask used includes the effect of the + // mask for the innermost dimension. + // + // We start out this process by emitting the check that determines + // whether any of the enclosing dimensions is partially active + // (i.e. processing extra elements that don't exactly fit into a + // vector). + llvm::BasicBlock *bbOuterInExtras = ctx->CreateBasicBlock("outer_in_extras"); - llvm::BasicBlock *bbOuterNotInExtras = + llvm::BasicBlock *bbOuterNotInExtras = ctx->CreateBasicBlock("outer_not_in_extras"); - ctx->SetCurrentBasicBlock(bbTest[nDims-1]); - if (inExtras.size()) + ctx->SetCurrentBasicBlock(bbTest[nDims-1]); + if (inExtras.size()) ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, - inExtras.back()); - else + inExtras.back()); + else // for a 1D iteration domain, we certainly don't have any enclosing // dimensions that are processing extra elements. 
ctx->BranchInst(bbOuterNotInExtras); - /////////////////////////////////////////////////////////////////////////// - // One or more outer dimensions in extras, so we need to mask for the loop - // body regardless. We break this into two cases, roughly: - // for (counter = start; counter < alignedEnd; counter += step) { - // // mask is all on for inner, so set mask to outer mask - // // run loop body with mask - // } - // // counter == alignedEnd - // if (counter < end) { - // // set mask to outermask & (counter+programCounter < end) - // // run loop body with mask - // } - llvm::BasicBlock *bbAllInnerPartialOuter = + /////////////////////////////////////////////////////////////////////////// + // One or more outer dimensions in extras, so we need to mask for the loop + // body regardless. We break this into two cases, roughly: + // for (counter = start; counter < alignedEnd; counter += step) { + // // mask is all on for inner, so set mask to outer mask + // // run loop body with mask + // } + // // counter == alignedEnd + // if (counter < end) { + // // set mask to outermask & (counter+programCounter < end) + // // run loop body with mask + // } + llvm::BasicBlock *bbAllInnerPartialOuter = ctx->CreateBasicBlock("all_inner_partial_outer"); - llvm::BasicBlock *bbPartial = + llvm::BasicBlock *bbPartial = ctx->CreateBasicBlock("both_partial"); - ctx->SetCurrentBasicBlock(bbOuterInExtras); { + ctx->SetCurrentBasicBlock(bbOuterInExtras); { // Update the varying counter value here, since all subsequent // blocks along this path need it. lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); + dimVariables[nDims-1]->storagePtr, span); // here we just check to see if counter < alignedEnd llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); - } + } - // Below we have a basic block that runs the loop body code for the - // case where the mask is partially but not fully on. This same block - // runs in multiple cases: both for handling any ragged extra data for - // the innermost dimension but also when outer dimensions have set the - // mask to be partially on. - // - // The value stored in stepIndexAfterMaskedBodyPtr is used after each - // execution of the body code to determine whether the innermost index - // value should be incremented by the step (we're running the "for" - // loop of full vectors at the innermost dimension, with outer - // dimensions having set the mask to be partially on), or whether we're - // running once for the ragged extra bits at the end of the innermost - // dimension, in which case we're done with the innermost dimension and - // should step the loop counter for the next enclosing dimension - // instead. - llvm::Value *stepIndexAfterMaskedBodyPtr = + // Below we have a basic block that runs the loop body code for the + // case where the mask is partially but not fully on. This same block + // runs in multiple cases: both for handling any ragged extra data for + // the innermost dimension but also when outer dimensions have set the + // mask to be partially on. 
+ // + // The value stored in stepIndexAfterMaskedBodyPtr is used after each + // execution of the body code to determine whether the innermost index + // value should be incremented by the step (we're running the "for" + // loop of full vectors at the innermost dimension, with outer + // dimensions having set the mask to be partially on), or whether we're + // running once for the ragged extra bits at the end of the innermost + // dimension, in which case we're done with the innermost dimension and + // should step the loop counter for the next enclosing dimension + // instead. + llvm::Value *stepIndexAfterMaskedBodyPtr = ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); - /////////////////////////////////////////////////////////////////////////// - // We're in the inner loop part where the only masking is due to outer - // dimensions but the innermost dimension fits fully into a vector's - // width. Set the mask and jump to the masked loop body. - ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); { + /////////////////////////////////////////////////////////////////////////// + // We're in the inner loop part where the only masking is due to outer + // dimensions but the innermost dimension fits fully into a vector's + // width. Set the mask and jump to the masked loop body. + ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); { llvm::Value *mask; if (nDims == 1) - // 1D loop; we shouldn't ever get here anyway - mask = LLVMMaskAllOff; + // 1D loop; we shouldn't ever get here anyway + mask = LLVMMaskAllOff; else - mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); ctx->SetInternalMask(mask); ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); ctx->BranchInst(bbMaskedBody); - } + } - /////////////////////////////////////////////////////////////////////////// - // We need to include the effect of the innermost dimension in the mask - // for the final bits here - ctx->SetCurrentBasicBlock(bbPartial); { + /////////////////////////////////////////////////////////////////////////// + // We need to include the effect of the innermost dimension in the mask + // for the final bits here + ctx->SetCurrentBasicBlock(bbPartial); { llvm::Value *varyingCounter = - ctx->LoadInst(dimVariables[nDims-1]->storagePtr); + ctx->LoadInst(dimVariables[nDims-1]->storagePtr); llvm::Value *smearEnd = ctx->BroadcastValue( endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); emask = ctx->I1VecToBoolVec(emask); if (nDims == 1) { - ctx->SetInternalMask(emask); + ctx->SetInternalMask(emask); } else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->SetInternalMask(newMask); + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + llvm::Value *newMask = + ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->SetInternalMask(newMask); } ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); ctx->BranchInst(bbMaskedBody); - } + } - /////////////////////////////////////////////////////////////////////////// - // None of the outer dimensions is processing extras; along the lines - // of above, we can express this as: - // for (counter = start; counter < alignedEnd; counter += step) { - // // mask 
is all on - // // run loop body with mask all on - // } - // // counter == alignedEnd - // if (counter < end) { - // // set mask to (counter+programCounter < end) - // // run loop body with mask - // } - llvm::BasicBlock *bbPartialInnerAllOuter = + /////////////////////////////////////////////////////////////////////////// + // None of the outer dimensions is processing extras; along the lines + // of above, we can express this as: + // for (counter = start; counter < alignedEnd; counter += step) { + // // mask is all on + // // run loop body with mask all on + // } + // // counter == alignedEnd + // if (counter < end) { + // // set mask to (counter+programCounter < end) + // // run loop body with mask + // } + llvm::BasicBlock *bbPartialInnerAllOuter = ctx->CreateBasicBlock("partial_inner_all_outer"); - ctx->SetCurrentBasicBlock(bbOuterNotInExtras); { + ctx->SetCurrentBasicBlock(bbOuterNotInExtras); { llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, - beforeAlignedEnd); - } + beforeAlignedEnd); + } - /////////////////////////////////////////////////////////////////////////// - // full_body: do a full vector's worth of work. We know that all - // lanes will be running here, so we explicitly set the mask to be 'all - // on'. This ends up being relatively straightforward: just update the - // value of the varying loop counter and have the statements in the - // loop body emit their code. - llvm::BasicBlock *bbFullBodyContinue = + /////////////////////////////////////////////////////////////////////////// + // full_body: do a full vector's worth of work. We know that all + // lanes will be running here, so we explicitly set the mask to be 'all + // on'. This ends up being relatively straightforward: just update the + // value of the varying loop counter and have the statements in the + // loop body emit their code. 
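// ---------------------------------------------------------------------------
// Illustration (not part of the patch): how the mask for the final "ragged"
// iteration of the innermost dimension is formed.  Lanes are modeled as bits
// of an unsigned value; the width, counter/end values and the outer mask are
// illustrative.  On the bbOuterInExtras path the per-lane test is ANDed with
// the enclosing dimensions' extras mask; on the partial_inner_only path it is
// used directly.
#include <cstdio>

int main() {
    const int width = 8, end = 19, counter = 16;   // illustrative values
    const unsigned outerMask = 0x3F;               // pretend two outer lanes are off

    unsigned innerMask = 0;
    for (int lane = 0; lane < width; ++lane)
        if (counter + lane < end)                  // varyingCounter < smear_end
            innerMask |= 1u << lane;

    printf("inner-only mask: 0x%02X\n", innerMask);              // prints 0x07
    printf("with outer mask: 0x%02X\n", innerMask & outerMask);  // prints 0x07
    return 0;
}
// ---------------------------------------------------------------------------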
+ llvm::BasicBlock *bbFullBodyContinue = ctx->CreateBasicBlock("foreach_full_continue"); - ctx->SetCurrentBasicBlock(bbFullBody); { + ctx->SetCurrentBasicBlock(bbFullBody); { ctx->SetInternalMask(LLVMMaskAllOn); ctx->SetBlockEntryMask(LLVMMaskAllOn); lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); + dimVariables[nDims-1]->storagePtr, span); ctx->SetContinueTarget(bbFullBodyContinue); ctx->AddInstrumentationPoint("foreach loop body (all on)"); stmts->EmitCode(ctx); AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); ctx->BranchInst(bbFullBodyContinue); - } - ctx->SetCurrentBasicBlock(bbFullBodyContinue); { + } + ctx->SetCurrentBasicBlock(bbFullBodyContinue); { ctx->RestoreContinuedLanes(); llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); ctx->BranchInst(bbOuterNotInExtras); - } + } - /////////////////////////////////////////////////////////////////////////// - // We're done running blocks with the mask all on; see if the counter is - // less than the end value, in which case we need to run the body one - // more time to get the extra bits. - llvm::BasicBlock *bbSetInnerMask = + /////////////////////////////////////////////////////////////////////////// + // We're done running blocks with the mask all on; see if the counter is + // less than the end value, in which case we need to run the body one + // more time to get the extra bits. + llvm::BasicBlock *bbSetInnerMask = ctx->CreateBasicBlock("partial_inner_only"); - ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); { + ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); { llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); llvm::Value *beforeFullEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[nDims-1], "before_full_end"); + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[nDims-1], "before_full_end"); ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); - } + } - /////////////////////////////////////////////////////////////////////////// - // The outer dimensions are all on, so the mask is just given by the - // mask for the innermost dimension - ctx->SetCurrentBasicBlock(bbSetInnerMask); { + /////////////////////////////////////////////////////////////////////////// + // The outer dimensions are all on, so the mask is just given by the + // mask for the innermost dimension + ctx->SetCurrentBasicBlock(bbSetInnerMask); { llvm::Value *varyingCounter = - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); llvm::Value *smearEnd = ctx->BroadcastValue( endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); emask = ctx->I1VecToBoolVec(emask); ctx->SetInternalMask(emask); ctx->SetBlockEntryMask(emask); ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); ctx->BranchInst(bbMaskedBody); 
- } + } - /////////////////////////////////////////////////////////////////////////// - // masked_body: set the mask and have the statements emit their - // code again. Note that it's generally worthwhile having two copies - // of the statements' code, since the code above is emitted with the - // mask known to be all-on, which in turn leads to more efficient code - // for that case. - llvm::BasicBlock *bbStepInnerIndex = + /////////////////////////////////////////////////////////////////////////// + // masked_body: set the mask and have the statements emit their + // code again. Note that it's generally worthwhile having two copies + // of the statements' code, since the code above is emitted with the + // mask known to be all-on, which in turn leads to more efficient code + // for that case. + llvm::BasicBlock *bbStepInnerIndex = ctx->CreateBasicBlock("step_inner_index"); - llvm::BasicBlock *bbMaskedBodyContinue = + llvm::BasicBlock *bbMaskedBodyContinue = ctx->CreateBasicBlock("foreach_masked_continue"); - ctx->SetCurrentBasicBlock(bbMaskedBody); { + ctx->SetCurrentBasicBlock(bbMaskedBody); { ctx->AddInstrumentationPoint("foreach loop body (masked)"); ctx->SetContinueTarget(bbMaskedBodyContinue); ctx->DisableGatherScatterWarnings(); @@ -1790,34 +1836,495 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { stmts->EmitCode(ctx); ctx->EnableGatherScatterWarnings(); ctx->BranchInst(bbMaskedBodyContinue); - } - ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); { + } + ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); { ctx->RestoreContinuedLanes(); llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); - } + } - /////////////////////////////////////////////////////////////////////////// - // step the innermost index, for the case where we're doing the - // innermost for loop over full vectors. - ctx->SetCurrentBasicBlock(bbStepInnerIndex); { + /////////////////////////////////////////////////////////////////////////// + // step the innermost index, for the case where we're doing the + // innermost for loop over full vectors. + ctx->SetCurrentBasicBlock(bbStepInnerIndex); { llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); ctx->BranchInst(bbOuterInExtras); + } + + /////////////////////////////////////////////////////////////////////////// + // foreach_exit: All done. Restore the old mask and clean up + ctx->SetCurrentBasicBlock(bbExit); + + ctx->SetInternalMask(oldMask); + ctx->SetFunctionMask(oldFunctionMask); + + ctx->EndForeach(); + ctx->EndScope(); } + else /* isPTX() == true */ + { + llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body"); + llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body"); + llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit"); - /////////////////////////////////////////////////////////////////////////// - // foreach_exit: All done. 
Restore the old mask and clean up - ctx->SetCurrentBasicBlock(bbExit); + llvm::Value *oldMask = ctx->GetInternalMask(); + llvm::Value *oldFunctionMask = ctx->GetFunctionMask(); - ctx->SetInternalMask(oldMask); - ctx->SetFunctionMask(oldFunctionMask); + ctx->SetDebugPos(pos); + ctx->StartScope(); - ctx->EndForeach(); - ctx->EndScope(); + ctx->SetInternalMask(LLVMMaskAllOn); + ctx->SetFunctionMask(LLVMMaskAllOn); + + // This should be caught during typechecking + AssertPos(pos, startExprs.size() == dimVariables.size() && + endExprs.size() == dimVariables.size()); + int nDims = (int)dimVariables.size(); + + /////////////////////////////////////////////////////////////////////// + // Setup: compute the number of items we have to work on in each + // dimension and a number of derived values. + std::vector bbReset, bbStep, bbTest; + std::vector startVals, endVals, uniformCounterPtrs; + std::vector nExtras, alignedEnd, extrasMaskPtrs; + + std::vector span(nDims, 0); + const int vectorWidth = 32; + lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]); + for (int i = 0; i < nDims; i++) + { + fprintf(stderr, " i= %d [ %d ] : %d \n", + i, nDims, span[i]); + } + fprintf(stderr, " --- \n"); + + for (int i = 0; i < nDims; ++i) { + // Basic blocks that we'll fill in later with the looping logic for + // this dimension. + bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); + if (i < nDims-1) + // stepping for the innermost dimension is handled specially + bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); + bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); + + // Start and end value for this loop dimension + llvm::Value *sv = startExprs[i]->GetValue(ctx); + llvm::Value *ev = endExprs[i]->GetValue(ctx); + if (sv == NULL || ev == NULL) + return; + startVals.push_back(sv); + endVals.push_back(ev); + + // nItems = endVal - startVal + llvm::Value *nItems = + ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); + + // nExtras = nItems % (span for this dimension) + // This gives us the number of extra elements we need to deal with + // at the end of the loop for this dimension that don't fit cleanly + // into a vector width. + nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, + LLVMInt32(span[i]), "nextras")); + + // alignedEnd = endVal - nExtras + alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, + nExtras[i], "aligned_end")); + + /////////////////////////////////////////////////////////////////////// + // Each dimension has a loop counter that is a uniform value that + // goes from startVal to endVal, in steps of the span for this + // dimension. Its value is only used internally here for looping + // logic and isn't directly available in the user's program code. + uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, + "counter")); + ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); + + // There is also a varying variable that holds the set of index + // values for each dimension in the current loop iteration; this is + // the value that is program-visible. + dimVariables[i]->storagePtr = + ctx->AllocaInst(LLVMTypes::Int32VectorType, + dimVariables[i]->name.c_str()); + dimVariables[i]->parentFunction = ctx->GetFunction(); + ctx->EmitVariableDebugInfo(dimVariables[i]); + + // Each dimension also maintains a mask that represents which of + // the varying elements in the current iteration should be + // processed. (i.e. this is used to disable the lanes that have + // out-of-bounds offsets.) 
+ extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); + ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); + } + + ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR); + + // On to the outermost loop's test + ctx->BranchInst(bbTest[0]); + + /////////////////////////////////////////////////////////////////////////// + // foreach_reset: this code runs when we need to reset the counter for + // a given dimension in preparation for running through its loop again, + // after the enclosing level advances its counter. + for (int i = 0; i < nDims; ++i) { + ctx->SetCurrentBasicBlock(bbReset[i]); + if (i == 0) + ctx->BranchInst(bbExit); + else { + ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); + ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); + ctx->BranchInst(bbStep[i-1]); + } + } + + /////////////////////////////////////////////////////////////////////////// + // foreach_step: increment the uniform counter by the vector width. + // Note that we don't increment the varying counter here as well but + // just generate its value when we need it in the loop body. Don't do + // this for the innermost dimension, which has a more complex stepping + // structure.. + for (int i = 0; i < nDims-1; ++i) { + ctx->SetCurrentBasicBlock(bbStep[i]); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[i]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[i]); + ctx->BranchInst(bbTest[i]); + } + + /////////////////////////////////////////////////////////////////////////// + // foreach_test (for all dimensions other than the innermost...) + std::vector inExtras; + for (int i = 0; i < nDims-1; ++i) { + ctx->SetCurrentBasicBlock(bbTest[i]); + + llvm::Value *haveExtras = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, + endVals[i], alignedEnd[i], "have_extras"); + + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); + llvm::Value *atAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + counter, alignedEnd[i], "at_aligned_end"); + llvm::Value *inEx = + ctx->BinaryOperator(llvm::Instruction::And, haveExtras, + atAlignedEnd, "in_extras"); + + if (i == 0) + inExtras.push_back(inEx); + else + inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, + inExtras[i-1], "in_extras_all")); + + llvm::Value *varyingCounter = + lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], + dimVariables[i]->storagePtr, span); + + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[i], LLVMTypes::Int32VectorType, "smear_end"); + + // Do a vector compare of its value to the end value to generate a + // mask for this last bit of work. 
+ llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); + + if (i == 0) + ctx->StoreInst(emask, extrasMaskPtrs[i]); + else { + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); + llvm::Value *newMask = + ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->StoreInst(newMask, extrasMaskPtrs[i]); + } + + llvm::Value *notAtEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[i]); + ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); + } + + /////////////////////////////////////////////////////////////////////////// + // foreach_test (for innermost dimension) + // + // All of the outer dimensions are handled generically--basically as a + // for() loop from the start value to the end value, where at each loop + // test, we compute the mask of active elements for the current + // dimension and then update an overall mask that is the AND + // combination of all of the outer ones. + // + // The innermost loop is handled specially, for performance purposes. + // When starting the innermost dimension, we start by checking once + // whether any of the outer dimensions has set the mask to be + // partially-active or not. We follow different code paths for these + // two cases, taking advantage of the knowledge that the mask is all + // on, when this is the case. + // + // In each of these code paths, we start with a loop from the starting + // value to the aligned end value for the innermost dimension; we can + // guarantee that the innermost loop will have an "all on" mask (as far + // as its dimension is concerned) for the duration of this loop. Doing + // so allows us to emit code that assumes the mask is all on (for the + // case where none of the outer dimensions has set the mask to be + // partially on), or allows us to emit code that just uses the mask + // from the outer dimensions directly (for the case where they have). + // + // After this loop, we just need to deal with one vector's worth of + // "ragged extra bits", where the mask used includes the effect of the + // mask for the innermost dimension. + // + // We start out this process by emitting the check that determines + // whether any of the enclosing dimensions is partially active + // (i.e. processing extra elements that don't exactly fit into a + // vector). + llvm::BasicBlock *bbOuterInExtras = + ctx->CreateBasicBlock("outer_in_extras"); + llvm::BasicBlock *bbOuterNotInExtras = + ctx->CreateBasicBlock("outer_not_in_extras"); + + ctx->SetCurrentBasicBlock(bbTest[nDims-1]); + if (inExtras.size()) + ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, + inExtras.back()); + else + // for a 1D iteration domain, we certainly don't have any enclosing + // dimensions that are processing extra elements. + ctx->BranchInst(bbOuterNotInExtras); + + /////////////////////////////////////////////////////////////////////////// + // One or more outer dimensions in extras, so we need to mask for the loop + // body regardless. 
We break this into two cases, roughly: + // for (counter = start; counter < alignedEnd; counter += step) { + // // mask is all on for inner, so set mask to outer mask + // // run loop body with mask + // } + // // counter == alignedEnd + // if (counter < end) { + // // set mask to outermask & (counter+programCounter < end) + // // run loop body with mask + // } + llvm::BasicBlock *bbAllInnerPartialOuter = + ctx->CreateBasicBlock("all_inner_partial_outer"); + llvm::BasicBlock *bbPartial = + ctx->CreateBasicBlock("both_partial"); + ctx->SetCurrentBasicBlock(bbOuterInExtras); { + // Update the varying counter value here, since all subsequent + // blocks along this path need it. + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + + // here we just check to see if counter < alignedEnd + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); + } + + // Below we have a basic block that runs the loop body code for the + // case where the mask is partially but not fully on. This same block + // runs in multiple cases: both for handling any ragged extra data for + // the innermost dimension but also when outer dimensions have set the + // mask to be partially on. + // + // The value stored in stepIndexAfterMaskedBodyPtr is used after each + // execution of the body code to determine whether the innermost index + // value should be incremented by the step (we're running the "for" + // loop of full vectors at the innermost dimension, with outer + // dimensions having set the mask to be partially on), or whether we're + // running once for the ragged extra bits at the end of the innermost + // dimension, in which case we're done with the innermost dimension and + // should step the loop counter for the next enclosing dimension + // instead. + llvm::Value *stepIndexAfterMaskedBodyPtr = + ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); + + /////////////////////////////////////////////////////////////////////////// + // We're in the inner loop part where the only masking is due to outer + // dimensions but the innermost dimension fits fully into a vector's + // width. Set the mask and jump to the masked loop body. 
+ ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); { + llvm::Value *mask; + if (nDims == 1) + // 1D loop; we shouldn't ever get here anyway + mask = LLVMMaskAllOff; + else + mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + + ctx->SetInternalMask(mask); + + ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); + } + + /////////////////////////////////////////////////////////////////////////// + // We need to include the effect of the innermost dimension in the mask + // for the final bits here + ctx->SetCurrentBasicBlock(bbPartial); { + llvm::Value *varyingCounter = + ctx->LoadInst(dimVariables[nDims-1]->storagePtr); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); + + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); + + if (nDims == 1) { + ctx->SetInternalMask(emask); + } + else { + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + llvm::Value *newMask = + ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->SetInternalMask(newMask); + } + + ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); + } + + /////////////////////////////////////////////////////////////////////////// + // None of the outer dimensions is processing extras; along the lines + // of above, we can express this as: + // for (counter = start; counter < alignedEnd; counter += step) { + // // mask is all on + // // run loop body with mask all on + // } + // // counter == alignedEnd + // if (counter < end) { + // // set mask to (counter+programCounter < end) + // // run loop body with mask + // } + llvm::BasicBlock *bbPartialInnerAllOuter = + ctx->CreateBasicBlock("partial_inner_all_outer"); + ctx->SetCurrentBasicBlock(bbOuterNotInExtras); { + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, + beforeAlignedEnd); + } + + /////////////////////////////////////////////////////////////////////////// + // full_body: do a full vector's worth of work. We know that all + // lanes will be running here, so we explicitly set the mask to be 'all + // on'. This ends up being relatively straightforward: just update the + // value of the varying loop counter and have the statements in the + // loop body emit their code. 
+ llvm::BasicBlock *bbFullBodyContinue = + ctx->CreateBasicBlock("foreach_full_continue"); + ctx->SetCurrentBasicBlock(bbFullBody); { + ctx->SetInternalMask(LLVMMaskAllOn); + ctx->SetBlockEntryMask(LLVMMaskAllOn); + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + ctx->SetContinueTarget(bbFullBodyContinue); + ctx->AddInstrumentationPoint("foreach loop body (all on)"); + stmts->EmitCode(ctx); + AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); + ctx->BranchInst(bbFullBodyContinue); + } + ctx->SetCurrentBasicBlock(bbFullBodyContinue); { + ctx->RestoreContinuedLanes(); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); + ctx->BranchInst(bbOuterNotInExtras); + } + + /////////////////////////////////////////////////////////////////////////// + // We're done running blocks with the mask all on; see if the counter is + // less than the end value, in which case we need to run the body one + // more time to get the extra bits. + llvm::BasicBlock *bbSetInnerMask = + ctx->CreateBasicBlock("partial_inner_only"); + ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); { + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeFullEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[nDims-1], "before_full_end"); + ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); + } + + /////////////////////////////////////////////////////////////////////////// + // The outer dimensions are all on, so the mask is just given by the + // mask for the innermost dimension + ctx->SetCurrentBasicBlock(bbSetInnerMask); { + llvm::Value *varyingCounter = + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); + ctx->SetInternalMask(emask); + ctx->SetBlockEntryMask(emask); + + ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); + } + + /////////////////////////////////////////////////////////////////////////// + // masked_body: set the mask and have the statements emit their + // code again. Note that it's generally worthwhile having two copies + // of the statements' code, since the code above is emitted with the + // mask known to be all-on, which in turn leads to more efficient code + // for that case. 
+ llvm::BasicBlock *bbStepInnerIndex = + ctx->CreateBasicBlock("step_inner_index"); + llvm::BasicBlock *bbMaskedBodyContinue = + ctx->CreateBasicBlock("foreach_masked_continue"); + ctx->SetCurrentBasicBlock(bbMaskedBody); { + ctx->AddInstrumentationPoint("foreach loop body (masked)"); + ctx->SetContinueTarget(bbMaskedBodyContinue); + ctx->DisableGatherScatterWarnings(); + ctx->SetBlockEntryMask(ctx->GetFullMask()); + stmts->EmitCode(ctx); + ctx->EnableGatherScatterWarnings(); + ctx->BranchInst(bbMaskedBodyContinue); + } + ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); { + ctx->RestoreContinuedLanes(); + llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); + } + + /////////////////////////////////////////////////////////////////////////// + // step the innermost index, for the case where we're doing the + // innermost for loop over full vectors. + ctx->SetCurrentBasicBlock(bbStepInnerIndex); { + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); + ctx->BranchInst(bbOuterInExtras); + } + + /////////////////////////////////////////////////////////////////////////// + // foreach_exit: All done. Restore the old mask and clean up + ctx->SetCurrentBasicBlock(bbExit); + + ctx->SetInternalMask(oldMask); + ctx->SetFunctionMask(oldFunctionMask); + + ctx->EndForeach(); + ctx->EndScope(); + } }
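Both arms of the new if (!g->target->isPTX()) fork in ForeachStmt::EmitCode are nearly identical: the PTX arm differs only in the width passed to lGetSpans (a hard-coded 32 instead of g->target->getVectorWidth()) and in the leftover span-dump fprintf calls. As a hedged follow-up sketch, and not something this patch does, the width could be chosen once and the rest of the emission path shared; the helper name lForeachVectorWidth is hypothetical and the call-site line is shown only for illustration.

    // Hypothetical sketch, not part of this patch: pick the foreach width once
    // and keep a single copy of the emission code for both target families.
    static int lForeachVectorWidth() {
        // PTX code generation iterates 32 lanes at a time in this patch;
        // every other target uses its native vector width.
        return g->target->isPTX() ? 32 : g->target->getVectorWidth();
    }

    // ...the single shared call site would then read:
    //     lGetSpans(nDims-1, nDims, lForeachVectorWidth(), isTiled, &span[0]);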