diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index d654ac3ec9273..64efd3ca32200 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -407,6 +407,13 @@ class LoopVectorizationLegality { return hasUncountableEarlyExit() ? getUncountableEdge()->second : nullptr; } + /// Returns true if this is an early exit loop containing a store. + bool isConditionCopyRequired() const { return RequiresEarlyExitConditionCopy; } + + /// Returns the load instruction, if any, nearest to an uncountable early + /// exit. + std::optional getEarlyExitLoad() const { return EarlyExitLoad; } + /// Return true if there is store-load forwarding dependencies. bool isSafeForAnyStoreLoadForwardDistances() const { return LAI->getDepChecker().isSafeForAnyStoreLoadForwardDistances(); @@ -654,6 +661,16 @@ class LoopVectorizationLegality { /// Keep track of the loop edge to an uncountable exit, comprising a pair /// of (Exiting, Exit) blocks, if there is exactly one early exit. std::optional> UncountableEdge; + + /// Indicates that we will need to copy the early exit condition into + /// the vector preheader, as we will need to mask some operations in + /// the loop (e.g. stores). + bool RequiresEarlyExitConditionCopy = false; + + /// The load used to determine an uncountable early-exit condition. This is + /// only used to allow further analysis in canVectorizeMemory if we found + /// what looks like a valid early exit loop with store beforehand. 
+ std::optional EarlyExitLoad; }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 3ec6850d6f685..9b4e1751ab11d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -1209,6 +1210,46 @@ bool LoopVectorizationLegality::canVectorizeMemory() { }); } + // FIXME: Remove or reduce this restriction. We're in a bit of an odd spot + // since we're (potentially) doing the load out of its normal order + // in the loop and that may throw off dependency checking. + // A forward dependency should be fine, but a backwards dep may not + // be even if LAA thinks it is due to performing the load for the + // vector iteration i+1 in vector iteration i. + if (isConditionCopyRequired()) { + assert(EarlyExitLoad.has_value() && "EE Store without condition load."); + + if (LAI->canVectorizeMemory()) { + const MemoryDepChecker &DepChecker = LAI->getDepChecker(); + const auto *Deps = DepChecker.getDependences(); + // getDependences() returns null when LAA stopped recording dependences + // (MaxDependences exceeded); be conservative and refuse to vectorize. + if (!Deps) { + reportVectorizationFailure( + "Dependence info unavailable for early exit condition load", + "Cannot vectorize early exit loop without recorded memory " + "dependences for the condition load.", + "EarlyExitLoadDependencesUnknown", ORE, TheLoop); + return false; + } + + for (const MemoryDepChecker::Dependence &Dep : *Deps) { + if (Dep.getDestination(DepChecker) == EarlyExitLoad || + Dep.getSource(DepChecker) == EarlyExitLoad) { + // Refine language a little? This currently only applies when a store + // is present in the early exit loop. 
+ reportVectorizationFailure( + "No dependencies allowed for early exit condition load", + "Early exit condition loads may not have a dependence with another" + " memory operation.", + "EarlyExitLoadDependence", ORE, + TheLoop); + return false; + } + } + } + } + if (!LAI->canVectorizeMemory()) return canVectorizeIndirectUnsafeDependences(); @@ -1627,6 +1668,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { // Keep a record of all the exiting blocks. SmallVector Predicates; std::optional> SingleUncountableEdge; + std::optional EELoad; for (BasicBlock *BB : ExitingBlocks) { const SCEV *EC = PSE.getSE()->getPredicatedExitCount(TheLoop, BB, &Predicates); @@ -1656,6 +1698,21 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { return false; } + // For loops with stores. + // Record load for analysis by isDereferenceableAndAlignedInLoop + // and later by dependence analysis. + if (BranchInst *Br = dyn_cast(BB->getTerminator())) { + // FIXME: Handle exit conditions with multiple users, more complex exit + // conditions than br(icmp(load, loop_inv)). 
+ ICmpInst *Cmp = dyn_cast(Br->getCondition()); + if (Cmp && Cmp->hasOneUse() && + TheLoop->isLoopInvariant(Cmp->getOperand(1))) { + LoadInst *Load = dyn_cast(Cmp->getOperand(0)); + if (Load && Load->hasOneUse() && TheLoop->contains(Load)) + EELoad = Load; + } + } + SingleUncountableEdge = {BB, ExitBlock}; } else CountableExitingBlocks.push_back(BB); @@ -1708,16 +1765,31 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { } }; + bool HasStore = false; for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { + if (StoreInst *SI = dyn_cast(&I)) { + HasStore = true; + if (SI->isSimple()) + continue; + + reportVectorizationFailure( + "Complex writes to memory unsupported in early exit loops", + "Cannot vectorize early exit loop with complex writes to memory", + "WritesInEarlyExitLoop", ORE, TheLoop); + return false; + } + if (I.mayWriteToMemory()) { // We don't support writes to memory. reportVectorizationFailure( - "Writes to memory unsupported in early exit loops", - "Cannot vectorize early exit loop with writes to memory", + "Complex writes to memory unsupported in early exit loops", + "Cannot vectorize early exit loop with complex writes to memory", "WritesInEarlyExitLoop", ORE, TheLoop); return false; - } else if (!IsSafeOperation(&I)) { + } + + if (!IsSafeOperation(&I)) { reportVectorizationFailure("Early exit loop contains operations that " "cannot be speculatively executed", "UnsafeOperationsEarlyExitLoop", ORE, @@ -1732,13 +1804,54 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { // TODO: Handle loops that may fault. 
Predicates.clear(); - if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, - &Predicates)) { + + if (HasStore && EELoad.has_value()) { + LoadInst *LI = *EELoad; + if (isDereferenceableAndAlignedInLoop(LI, TheLoop, *PSE.getSE(), *DT, AC, + &Predicates)) { + ICFLoopSafetyInfo SafetyInfo; + SafetyInfo.computeLoopSafetyInfo(TheLoop); + // FIXME: We may have multiple levels of conditional loads, so will + // need to improve on outright rejection at some point. + if (!SafetyInfo.isGuaranteedToExecute(*LI, DT, TheLoop)) { + LLVM_DEBUG( + dbgs() << "Early exit condition load not guaranteed to execute.\n"); + reportVectorizationFailure( + "Early exit condition load not guaranteed to execute", + "Cannot vectorize early exit loop when condition load is not " + "guaranteed to execute", + "EarlyExitLoadNotGuaranteed", ORE, TheLoop); + return false; + } + } else { + LLVM_DEBUG(dbgs() << "Early exit condition load potentially unsafe.\n"); + reportVectorizationFailure("Uncounted loop condition not known safe", + "Cannot vectorize early exit loop with " + "possibly unsafe condition load", + "PotentiallyFaultingEarlyExitLoop", ORE, + TheLoop); + return false; + } + } else if (HasStore) { + LLVM_DEBUG(dbgs() << "Found early exit store but no condition load.\n"); reportVectorizationFailure( - "Loop may fault", - "Cannot vectorize potentially faulting early exit loop", - "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop); + "Early exit loop with store but no condition load", + "Cannot vectorize early exit loop with store but no condition load", + "NoConditionLoadForEarlyExitLoop", ORE, TheLoop); return false; + } else { + // Read-only loop. + // FIXME: as with the loops with stores, only the loads contributing to + // the loop condition need to be guaranteed dereferenceable and + // aligned. 
+ if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC, + &Predicates)) { + reportVectorizationFailure( + "Loop may fault", + "Cannot vectorize potentially faulting early exit loop", + "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop); + return false; + } } [[maybe_unused]] const SCEV *SymbolicMaxBTC = @@ -1751,6 +1864,11 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() { "backedge taken count: " << *SymbolicMaxBTC << '\n'); UncountableEdge = SingleUncountableEdge; + if (HasStore) { + RequiresEarlyExitConditionCopy = true; + EarlyExitLoad = EELoad; + } + return true; } @@ -1823,6 +1941,8 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { } else { if (!isVectorizableEarlyExitLoop()) { UncountableEdge = std::nullopt; + EarlyExitLoad = std::nullopt; + RequiresEarlyExitConditionCopy = false; if (DoExtraAnalysis) Result = false; else diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4c1ed15ee700f..7dca356edca65 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9246,6 +9246,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths, *Plan, CM.getMinimalBitwidths()); VPlanTransforms::optimize(*Plan); + + // See if we can convert an early exit vplan to bail out to a scalar + // loop if state-changing operations (like stores) are present and + // an exit will be taken in the next vector iteration. + // If not, discard the plan. + if (Legal->isConditionCopyRequired() && !HasScalarVF && + !VPlanTransforms::runPass(VPlanTransforms::tryEarlyExitConversion, + *Plan)) + break; // TODO: try to put it close to addActiveLaneMask(). 
// Discard the plan if it is not EVL-compatible if (CM.foldTailWithEVL() && !HasScalarVF && @@ -9570,6 +9579,12 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { }, Range); auto Plan = std::make_unique(OrigLoop); + + // FIXME: Better place to put this? Or maybe an enum for how to handle + // early exits? + if (Legal->hasUncountableEarlyExit()) + Plan->setEarlyExitContinuesInScalarLoop(Legal->isConditionCopyRequired()); + // Build hierarchical CFG. // TODO: Convert to VPlan-transform and consolidate all transforms for VPlan // creation. @@ -9876,6 +9891,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { // Create new empty VPlan auto Plan = std::make_unique(OrigLoop); + + // FIXME: Better place to put this? Or maybe an enum for how to handle + // early exits? + if (Legal->hasUncountableEarlyExit()) + Plan->setEarlyExitContinuesInScalarLoop(Legal->isConditionCopyRequired()); + // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); HCFGBuilder.buildPlainCFG(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 7084676af6d5b..5320290049b42 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3522,6 +3522,13 @@ class VPlan { /// VPlan is destroyed. SmallVector CreatedBlocks; + /// Indicates that an early exit loop will exit before the condition is + /// reached, and that the scalar loop must perform the last few iterations. + /// FIXME: Is this the right place? We mainly want to make sure that we + /// know about this for transforming the plan to copy&move the exit + /// condition, but maybe it doesn't need to be in the plan itself. + bool EarlyExitContinuesInScalarLoop = false; + /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader /// wrapping the original header of the scalar loop. 
VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader) @@ -3825,6 +3832,16 @@ class VPlan { return ExitBlocks.size() > 1 || ExitBlocks[0]->getNumPredecessors() > 1; } + /// Returns true if all exit paths should reach the scalar loop. + bool shouldEarlyExitContinueInScalarLoop() const { + return EarlyExitContinuesInScalarLoop; + } + + /// Set early exit vectorization to always reach the scalar loop. + void setEarlyExitContinuesInScalarLoop(bool Continues) { + EarlyExitContinuesInScalarLoop = Continues; + } + /// Returns true if the scalar tail may execute after the vector loop. Note /// that this relies on unneeded branches to the scalar tail loop being /// removed. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9d8f1706cf61b..aef3d9b728ea5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2469,61 +2469,63 @@ void VPlanTransforms::handleUncountableEarlyExit( // block and have it conditionally branch to the early exit block if // EarlyExitTaken. auto *EarlyExitingBranch = - cast(UncountableExitingBlock->getTerminator()); + cast(UncountableExitingBlock->getTerminator()); BasicBlock *TrueSucc = EarlyExitingBranch->getSuccessor(0); BasicBlock *FalseSucc = EarlyExitingBranch->getSuccessor(1); BasicBlock *EarlyExitIRBB = - !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc; + !OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc; VPIRBasicBlock *VPEarlyExitBlock = Plan.getExitBlock(EarlyExitIRBB); - + VPValue *EarlyExitNotTakenCond = RecipeBuilder.getBlockInMask( - OrigLoop->contains(TrueSucc) ? 
TrueSucc : FalseSucc); - auto *EarlyExitTakenCond = Builder.createNot(EarlyExitNotTakenCond); - IsEarlyExitTaken = - Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond}); - - VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split"); - VPBasicBlock *VectorEarlyExitVPBB = - Plan.createVPBasicBlock("vector.early.exit"); - VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle); - VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB); - NewMiddle->swapSuccessors(); - - VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, VPEarlyExitBlock); - - // Update the exit phis in the early exit block. - VPBuilder MiddleBuilder(NewMiddle); - VPBuilder EarlyExitB(VectorEarlyExitVPBB); - for (VPRecipeBase &R : *VPEarlyExitBlock) { - auto *ExitIRI = dyn_cast(&R); - if (!ExitIRI) - break; + OrigLoop->contains(TrueSucc) ? TrueSucc : FalseSucc); + auto *EarlyExitTakenCond = Builder.createNot(EarlyExitNotTakenCond); + IsEarlyExitTaken = + Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond}); + + if (!Plan.shouldEarlyExitContinueInScalarLoop()) { + VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split"); + VPBasicBlock *VectorEarlyExitVPBB = + Plan.createVPBasicBlock("vector.early.exit"); + VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle); + VPBlockUtils::connectBlocks(NewMiddle, VectorEarlyExitVPBB); + NewMiddle->swapSuccessors(); + + VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, VPEarlyExitBlock); + + // Update the exit phis in the early exit block. 
+ VPBuilder MiddleBuilder(NewMiddle); + VPBuilder EarlyExitB(VectorEarlyExitVPBB); + for (VPRecipeBase &R : *VPEarlyExitBlock) { + auto *ExitIRI = dyn_cast(&R); + if (!ExitIRI) + break; - PHINode &ExitPhi = ExitIRI->getIRPhi(); - VPValue *IncomingFromEarlyExit = RecipeBuilder.getVPValueOrAddLiveIn( - ExitPhi.getIncomingValueForBlock(UncountableExitingBlock)); - - if (OrigLoop->getUniqueExitBlock()) { - // If there's a unique exit block, VPEarlyExitBlock has 2 predecessors - // (MiddleVPBB and NewMiddle). Add the incoming value from MiddleVPBB - // which is coming from the original latch. - VPValue *IncomingFromLatch = RecipeBuilder.getVPValueOrAddLiveIn( - ExitPhi.getIncomingValueForBlock(OrigLoop->getLoopLatch())); - ExitIRI->addOperand(IncomingFromLatch); - ExitIRI->extractLastLaneOfOperand(MiddleBuilder); - } - // Add the incoming value from the early exit. - if (!IncomingFromEarlyExit->isLiveIn() && !Plan.hasScalarVFOnly()) { - VPValue *FirstActiveLane = EarlyExitB.createNaryOp( - VPInstruction::FirstActiveLane, {EarlyExitTakenCond}, nullptr, - "first.active.lane"); - IncomingFromEarlyExit = EarlyExitB.createNaryOp( - Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane}, - nullptr, "early.exit.value"); + PHINode &ExitPhi = ExitIRI->getIRPhi(); + VPValue *IncomingFromEarlyExit = RecipeBuilder.getVPValueOrAddLiveIn( + ExitPhi.getIncomingValueForBlock(UncountableExitingBlock)); + + if (OrigLoop->getUniqueExitBlock()) { + // If there's a unique exit block, VPEarlyExitBlock has 2 predecessors + // (MiddleVPBB and NewMiddle). Add the incoming value from MiddleVPBB + // which is coming from the original latch. + VPValue *IncomingFromLatch = RecipeBuilder.getVPValueOrAddLiveIn( + ExitPhi.getIncomingValueForBlock(OrigLoop->getLoopLatch())); + ExitIRI->addOperand(IncomingFromLatch); + ExitIRI->extractLastLaneOfOperand(MiddleBuilder); + } + // Add the incoming value from the early exit. 
+ if (!IncomingFromEarlyExit->isLiveIn() && !Plan.hasScalarVFOnly()) { + VPValue *FirstActiveLane = EarlyExitB.createNaryOp( + VPInstruction::FirstActiveLane, {EarlyExitTakenCond}, nullptr, + "first.active.lane"); + IncomingFromEarlyExit = EarlyExitB.createNaryOp( + Instruction::ExtractElement, {IncomingFromEarlyExit, FirstActiveLane}, + nullptr, "early.exit.value"); + } + ExitIRI->addOperand(IncomingFromEarlyExit); } - ExitIRI->addOperand(IncomingFromEarlyExit); + MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken}); } - MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken}); // Replace the condition controlling the non-early exit from the vector loop // with one exiting if either the original condition of the vector latch is @@ -2538,6 +2540,115 @@ void VPlanTransforms::handleUncountableEarlyExit( Instruction::Or, {IsEarlyExitTaken, IsLatchExitTaken}); Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken); LatchExitingBranch->eraseFromParent(); + + // If the counted exit wasn't taken, continue in the scalar loop. + if (Plan.shouldEarlyExitContinueInScalarLoop()) { + VPRecipeBase *OldTerminator = MiddleVPBB->getTerminator(); + VPBuilder Builder(OldTerminator); + Builder.createNaryOp(VPInstruction::BranchOnCond, IsLatchExitTaken); + OldTerminator->eraseFromParent(); + } +} + +bool VPlanTransforms::tryEarlyExitConversion(VPlan &Plan) { + // We can abandon a vplan entirely if we return false here, so we shouldn't + // crash if some earlier assumptions on scalar IR don't hold for the vplan + // version of the loop. + if (Plan.hasScalarVFOnly()) + return false; + auto *Region = Plan.getVectorLoopRegion(); + auto *Branch = dyn_cast(Region->getExitingBasicBlock()->getTerminator()); + if (!Branch || Branch->getOpcode() != VPInstruction::BranchOnCond) + return false; + + // Extract the IR needed to create the uncountable exit condition. 
+ // Looking for br(or(any_of(icmp(load(gep(base, iv)), loop_inv)), counted) + // FIXME: Build a list of nodes to copy below instead of matching + // the exact pattern. + // FIXME: We should be able to handle multiple users for at least some of + // these nodes; requires creating phis. + // FIXME: This does feel a bit fragile; is it better to do this earlier + // when creating the initial recipe based on the scalar IR, instead + // of the vplan equivalent here? + // FIXME: New vplan pattern matchers; m_Load, m_ICmp, m_OneUse, etc. + VPInstruction *Or = dyn_cast(Branch->getOperand(0)); + if (!Or || Or->Users.size() != 1 || Or->getOpcode() != Instruction::Or) + return false; + auto *AnyOf = dyn_cast(Or->getOperand(0)); + if (!AnyOf || AnyOf->Users.size() != 1 || + AnyOf->getOpcode() != VPInstruction::AnyOf) + return false; + auto *Cmp = dyn_cast(AnyOf->getOperand(0)); + if (!Cmp || Cmp->Users.size() != 1 || Cmp->getOpcode() != Instruction::ICmp || + !Cmp->getOperand(1)->isDefinedOutsideLoopRegions()) + return false; + auto *Load = dyn_cast(Cmp->getOperand(0)); + if (!Load || Load->Users.size() != 1) + return false; + auto *VecPtr = dyn_cast(Load->getAddr()); + if (!VecPtr || VecPtr->Users.size() != 1) + return false; + auto *GEP = dyn_cast(VecPtr->getOperand(0)); + if (!GEP || GEP->Users.size() != 1 || GEP->getNumOperands() != 2 || + GEP->getOpcode() != Instruction::GetElementPtr) + return false; + auto *Base = GEP->getOperand(0); + if (!Base->isDefinedOutsideLoopRegions()) + return false; + auto *Steps = dyn_cast(GEP->getOperand(1)); + if (!Steps) + return false; + auto *IV = dyn_cast(Steps->getOperand(0)); + if (!IV) + return false; + VPInstruction *IVUpdate = cast(IV->getBackedgeValue()); + + // Duplicate exit IR and use the starting value for the IV phi. + // FIXME: Is making new nodes better than cloning? 
+ auto *VectorPH = Plan.getVectorPreheader(); + VPBuilder PHBuilder(VectorPH, VectorPH->getFirstNonPhi()); + VPReplicateRecipe *PHGEP = GEP->clone(); + PHGEP->setOperand(1, IV->getStartValue()); + PHBuilder.insert(PHGEP); + auto *PHVecPtr = VecPtr->clone(); + PHVecPtr->setOperand(0, PHGEP); + PHBuilder.insert(PHVecPtr); + VPWidenLoadRecipe *PHLoad = Load->clone(); + PHLoad->setOperand(0, PHVecPtr); + PHBuilder.insert(PHLoad); + VPWidenRecipe *PHCmp = Cmp->clone(); + PHCmp->setOperand(0, PHLoad); + PHBuilder.insert(PHCmp); + + // Split vector preheader to form a new bypass block. + VPBasicBlock *NewPH = VectorPH->splitAt(PHBuilder.getInsertPoint()); + VPBasicBlock *ScalarPH = Plan.getScalarPreheader(); + VPValue *PHAnyOf = PHBuilder.createNaryOp(VPInstruction::AnyOf, {PHCmp}); + PHBuilder.createNaryOp(VPInstruction::BranchOnCond, {PHAnyOf}, + PHCmp->getDebugLoc()); + VectorPH->clearSuccessors(); + VectorPH->setTwoSuccessors(ScalarPH, NewPH); + + // Fix up the resume phi in scalar preheader -- we might not have reached + // the calculated maximum vector tripcount, so just use the next value of IV. + // FIXME: Can we rely on the resume phi being first? + auto *ResumePHI = cast(&ScalarPH->front()); + VPBasicBlock *MiddleBlock = Plan.getMiddleBlock(); + ScalarPH->clearPredecessors(); + ScalarPH->setPredecessors({MiddleBlock, VectorPH}); + ResumePHI->addOperand(ResumePHI->getOperand(1)); + ResumePHI->setOperand(0, IVUpdate); + + // FIXME: May be better to move the IVUpdate before the GEP instead? + // FIXME: No domtree available at the recipe level, just for vpbasicblocks. + // Can we find iterator distances in the same block? 
+ GEP->moveAfter(IVUpdate); + GEP->setOperand(1, IVUpdate); + VecPtr->moveAfter(GEP); + Load->moveAfter(VecPtr); + Cmp->moveAfter(Load); + + return true; } void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index a9461b261ddb6..d28dc9fe30fa6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -152,6 +152,8 @@ struct VPlanTransforms { tryAddExplicitVectorLength(VPlan &Plan, const std::optional &MaxEVLSafeElements); + static bool tryEarlyExitConversion(VPlan &Plan); + // For each Interleave Group in \p InterleaveGroups replace the Recipes // widening its memory instructions with a single VPInterleaveRecipe at its // insertion point. diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 3f10b1756d7a1..0e59530bf21d9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -212,7 +212,8 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { isa(UI) || (isa(UI) && - cast(UI)->getOpcode() == Instruction::PHI)) + (cast(UI)->getOpcode() == Instruction::PHI || + cast(UI)->getOpcode() == VPInstruction::ResumePhi))) continue; // If the user is in the same block, check it comes after R in the diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll index 7d5b73477f6ed..85ab08b91abd5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -580,6 +580,81 @@ loop.end: ret i64 %retval } +define void @loop_contains_store_single_user(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) { +; CHECK-LABEL: define void @loop_contains_store_single_user( +; 
CHECK-SAME: ptr noalias dereferenceable(40) [[ARRAY:%.*]], ptr readonly align 2 dereferenceable(40) [[PRED:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i16, ptr [[PRED]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i16> [[WIDE_LOAD]], splat (i16 500) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH]], label [[VECTOR_PH_SPLIT:%.*]] +; CHECK: vector.ph.split: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_SPLIT]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i16, ptr [[ARRAY]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i16> [[WIDE_LOAD1]], splat (i16 1) +; CHECK-NEXT: store <4 x i16> [[TMP6]], ptr [[TMP5]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[PRED]], i64 [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i16> [[WIDE_LOAD2]], splat (i16 500) +; CHECK-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]]) +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20 +; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP10]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 [[TMP11]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_PH]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[ST_ADDR:%.*]] = getelementptr inbounds nuw i16, ptr [[ARRAY]], i64 [[IV]] +; CHECK-NEXT: [[DATA:%.*]] = load i16, ptr [[ST_ADDR]], align 2 +; CHECK-NEXT: [[INC:%.*]] = add nsw i16 [[DATA]], 1 +; CHECK-NEXT: store i16 [[INC]], ptr [[ST_ADDR]], align 2 +; CHECK-NEXT: [[EE_ADDR:%.*]] = getelementptr inbounds nuw i16, ptr [[PRED]], i64 [[IV]] +; CHECK-NEXT: [[EE_VAL:%.*]] = load i16, ptr [[EE_ADDR]], align 2 +; CHECK-NEXT: [[EE_COND:%.*]] = icmp sgt i16 [[EE_VAL]], 500 +; CHECK-NEXT: br i1 [[EE_COND]], label [[EXIT]], label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[COUNTED_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 20 +; CHECK-NEXT: br i1 [[COUNTED_COND]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv + %data = load i16, ptr %st.addr, align 2 + %inc = add nsw i16 %data, 1 + store i16 %inc, ptr %st.addr, align 2 + %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv + %ee.val = load i16, ptr %ee.addr, align 2 + %ee.cond = icmp sgt i16 %ee.val, 500 + br i1 %ee.cond, label %exit, label %for.inc + +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %counted.cond = icmp eq i64 %iv.next, 20 + br i1 %counted.cond, label %exit, label %for.body + +exit: + ret void +} + declare i32 @foo(i32) readonly declare @foo_vec() @@ -600,4 +675,6 @@ attributes #1 = 
{ "target-features"="+sve" vscale_range(1,16) } ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/control-flow.ll b/llvm/test/Transforms/LoopVectorize/control-flow.ll index 3a8aec34dfe43..2578260fe878d 100644 --- a/llvm/test/Transforms/LoopVectorize/control-flow.ll +++ b/llvm/test/Transforms/LoopVectorize/control-flow.ll @@ -10,7 +10,7 @@ ; return 0; ; } -; CHECK: remark: source.cpp:5:9: loop not vectorized: Cannot vectorize early exit loop with writes to memory +; CHECK: remark: source.cpp:5:9: loop not vectorized: Cannot vectorize early exit loop with possibly unsafe condition load ; CHECK: remark: source.cpp:5:9: loop not vectorized ; CHECK: _Z4testPii diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll index de455c81d363e..6eb9fc2adeb70 100644 --- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll +++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll @@ -445,7 +445,7 @@ loop.end: define i64 @loop_contains_store(ptr %dest) { ; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store' -; CHECK: LV: Not vectorizing: Writes to memory unsupported in early exit loops +; CHECK: LV: Not vectorizing: Early exit loop with store but no condition load. entry: %p1 = alloca [1024 x i8] call void @init_mem(ptr %p1, i64 1024) @@ -470,6 +470,192 @@ loop.end: ret i64 %retval } +define void @loop_contains_store_single_user(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_single_user' +; CHECK: LV: We can vectorize this loop! 
+entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv + %data = load i16, ptr %st.addr, align 2 + %inc = add nsw i16 %data, 1 + store i16 %inc, ptr %st.addr, align 2 + %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv + %ee.val = load i16, ptr %ee.addr, align 2 + %ee.cond = icmp sgt i16 %ee.val, 500 + br i1 %ee.cond, label %exit, label %for.inc + +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %counted.cond = icmp eq i64 %iv.next, 20 + br i1 %counted.cond, label %exit, label %for.body + +exit: + ret void +} + +define void @loop_contains_store_multi_user(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_multi_user' +; CHECK: LV: Not vectorizing: Early exit loop with store but no condition load. +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv + %data = load i16, ptr %st.addr, align 2 + %inc = add nsw i16 %data, 1 + store i16 %inc, ptr %st.addr, align 2 + %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv + %ee.val = load i16, ptr %ee.addr, align 2 + %ee.cond = icmp sgt i16 %ee.val, 500 + %unused = add i16 %ee.val, 42 + br i1 %ee.cond, label %exit, label %for.inc + +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %counted.cond = icmp eq i64 %iv.next, 20 + br i1 %counted.cond, label %exit, label %for.body + +exit: + ret void +} + +define void @loop_contains_store_fcmp(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_fcmp' +; CHECK: LV: Not vectorizing: Early exit loop with store but no condition load. 
+entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv + %data = load i16, ptr %st.addr, align 2 + %inc = add nsw i16 %data, 1 + store i16 %inc, ptr %st.addr, align 2 + %ee.addr = getelementptr inbounds nuw half, ptr %pred, i64 %iv + %ee.val = load half, ptr %ee.addr, align 2 + %ee.cond = fcmp ugt half %ee.val, 500.0 + br i1 %ee.cond, label %exit, label %for.inc + +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %counted.cond = icmp eq i64 %iv.next, 20 + br i1 %counted.cond, label %exit, label %for.body + +exit: + ret void +} + +define void @loop_contains_store_safe_dependency(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(80) readonly %pred) { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_safe_dependency' +; CHECK: LV: Not vectorizing: No dependencies allowed for early exit condition load. +entry: + %forward = getelementptr i16, ptr %pred, i64 -8 + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv + %data = load i16, ptr %st.addr, align 2 + %inc = add nsw i16 %data, 1 + store i16 %inc, ptr %st.addr, align 2 + %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv + %ee.val = load i16, ptr %ee.addr, align 2 + %ee.cond = icmp sgt i16 %ee.val, 500 + %some.addr = getelementptr inbounds nuw i16, ptr %forward, i64 %iv + store i16 42, ptr %some.addr, align 2 + br i1 %ee.cond, label %exit, label %for.inc + +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %counted.cond = icmp eq i64 %iv.next, 20 + br i1 %counted.cond, label %exit, label %for.body + +exit: + ret void +} + +define void @loop_contains_store_assumed_bounds(ptr noalias %array, ptr readonly %pred, i32 %n) { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_assumed_bounds' +; CHECK: LV: Not vectorizing: Uncounted loop condition not known safe. 
+entry:
+  %n_bytes = mul nuw nsw i32 %n, 2
+  ; Alignment and dereferenceable size of %pred are supplied through assume
+  ; operand bundles; the size (%n_bytes = 2 * %n) is a runtime value.
+  call void @llvm.assume(i1 true) [ "align"(ptr %pred, i64 2), "dereferenceable"(ptr %pred, i32 %n_bytes) ]
+  %tc = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  ; The counted exit compares against the runtime trip count %tc.
+  %counted.cond = icmp eq i64 %iv.next, %tc
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; Early-exit loop whose store to %array is volatile. The expected debug output
+; is the "complex writes" rejection below.
+define void @loop_contains_store_volatile(ptr dereferenceable(40) noalias %array, ptr align 2 dereferenceable(40) readonly %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store_volatile'
+; CHECK: LV: Not vectorizing: Complex writes to memory unsupported in early exit loops.
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  ; Volatile store into %array.
+  store volatile i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  br i1 %ee.cond, label %exit, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  br i1 %counted.cond, label %exit, label %for.body
+
+exit:
+  ret void
+}
+
+; Single-block loop in which the early-exit compare and the counted compare are
+; merged with a select and feed one conditional branch, so the loop has only
+; one exiting branch. The expected debug output is the "uncountable loop"
+; rejection below.
+define void @exit_conditions_combined(ptr noalias dereferenceable(40) %array, ptr readonly align 2 dereferenceable(40) %pred) {
+; CHECK-LABEL: LV: Checking a loop in 'exit_conditions_combined'
+; CHECK: LV: Not vectorizing: Cannot vectorize uncountable loop.
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %st.addr = getelementptr inbounds nuw i16, ptr %array, i64 %iv
+  %data = load i16, ptr %st.addr, align 2
+  %inc = add nsw i16 %data, 1
+  store i16 %inc, ptr %st.addr, align 2
+  %ee.addr = getelementptr inbounds nuw i16, ptr %pred, i64 %iv
+  %ee.val = load i16, ptr %ee.addr, align 2
+  %ee.cond = icmp sgt i16 %ee.val, 500
+  %iv.next = add nuw nsw i64 %iv, 1
+  %counted.cond = icmp eq i64 %iv.next, 20
+  ; Logical OR of the two exit conditions, expressed as a select.
+  %or.cond = select i1 %ee.cond, i1 true, i1 %counted.cond
+  br i1 %or.cond, label %exit, label %for.body
+
+exit: ; preds = %for.body
+  ret void
+}
 define i64 @uncountable_exit_in_conditional_block(ptr %mask) { ; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_in_conditional_block'