llvm · huntergr-arm · Apr 4, 2025 · david-arm · May 9, 2025 · huntergr-arm
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -407,6 +407,13 @@ class LoopVectorizationLegality {
     return hasUncountableEarlyExit() ? getUncountableEdge()->second : nullptr;
   }
 
+  /// Returns true if this is an early exit loop containing a store.
+  bool isConditionCopyRequired() const { return RequiresEarlyExitConditionCopy; }
+
+  /// Returns the load feeding the uncountable early exit's compare, if one
+  /// was recorded while checking an early exit loop that contains a store.
+  std::optional<LoadInst *> getEarlyExitLoad() const { return EarlyExitLoad; }
+
   /// Return true if there is store-load forwarding dependencies.
   bool isSafeForAnyStoreLoadForwardDistances() const {
     return LAI->getDepChecker().isSafeForAnyStoreLoadForwardDistances();
@@ -654,6 +661,16 @@ class LoopVectorizationLegality {
   /// Keep track of the loop edge to an uncountable exit, comprising a pair
   /// of (Exiting, Exit) blocks, if there is exactly one early exit.
   std::optional<std::pair<BasicBlock *, BasicBlock *>> UncountableEdge;
+
+  /// Indicates that we will need to copy the early exit condition into
+  /// the vector preheader, as we will need to mask some operations in
+  /// the loop (e.g. stores).
+  bool RequiresEarlyExitConditionCopy = false;
+
+  /// The load used to determine an uncountable early-exit condition. This is
+  /// only used to allow further analysis in canVectorizeMemory if we found
+  /// what looks like a valid early exit loop with store beforehand.
+  std::optional<LoadInst *> EarlyExitLoad;
 };
 
 } // namespace llvm

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
@@ -1209,6 +1210,36 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     });
   }
 
+  // FIXME: Remove or reduce this restriction. We're in a bit of an odd spot
+  //        since we're (potentially) doing the load out of its normal order
+  //        in the loop and that may throw off dependency checking.
+  //        A forward dependency should be fine, but a backwards dep may not
+  //        be even if LAA thinks it is due to performing the load for the
+  //        vector iteration i+1 in vector iteration i.
+  if (isConditionCopyRequired()) {
+    assert(EarlyExitLoad.has_value() && "EE Store without condition load.");
+    if (LAI->canVectorizeMemory()) {
+      const MemoryDepChecker &DepChecker = LAI->getDepChecker();
+      // getDependences() returns null when LAA did not record dependences; we
+      // then cannot prove the condition load is independent, so assume not.
+      const auto *Deps = DepChecker.getDependences();
+      bool MayDepend = !Deps;
+      if (Deps)
+        for (const MemoryDepChecker::Dependence &Dep : *Deps)
+          MayDepend |= Dep.getDestination(DepChecker) == EarlyExitLoad ||
+                       Dep.getSource(DepChecker) == EarlyExitLoad;
+      // This restriction only applies to early exit loops containing a store.
+      if (MayDepend) {
+        reportVectorizationFailure(
+            "No dependencies allowed for early exit condition load",
+            "Early exit condition loads may not have a dependence with another"
+            " memory operation.",
+            "EarlyExitLoadDependence", ORE, TheLoop);
+        return false;
+      }
+    }
+  }
+
   if (!LAI->canVectorizeMemory())
     return canVectorizeIndirectUnsafeDependences();
 
@@ -1627,6 +1658,7 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
   // Keep a record of all the exiting blocks.
   SmallVector<const SCEVPredicate *, 4> Predicates;
   std::optional<std::pair<BasicBlock *, BasicBlock *>> SingleUncountableEdge;
+  std::optional<LoadInst *> EELoad;
   for (BasicBlock *BB : ExitingBlocks) {
     const SCEV *EC =
         PSE.getSE()->getPredicatedExitCount(TheLoop, BB, &Predicates);
@@ -1656,6 +1688,21 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
         return false;
       }
 
+      // Record the load feeding the exit condition. It is only needed for
+      // loops with stores, where isDereferenceableAndAlignedInLoop and the
+      // dependence checks in canVectorizeMemory analyse it later.
+      if (BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator())) {
+        // FIXME: Handle exit conditions with multiple users, more complex exit
+        //        conditions than br(icmp(load, loop_inv)).
+        ICmpInst *Cmp = dyn_cast<ICmpInst>(Br->getCondition());
+        if (Cmp && Cmp->hasOneUse() &&
+            TheLoop->isLoopInvariant(Cmp->getOperand(1))) {
+          LoadInst *Load = dyn_cast<LoadInst>(Cmp->getOperand(0));
+          if (Load && Load->hasOneUse() && TheLoop->contains(Load))
+            EELoad = Load;
+        }
+      }
+
       SingleUncountableEdge = {BB, ExitBlock};
     } else
       CountableExitingBlocks.push_back(BB);
@@ -1708,16 +1755,31 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
     }
   };
 
+  bool HasStore = false;
   for (auto *BB : TheLoop->blocks())
     for (auto &I : *BB) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+        HasStore = true;
+        if (SI->isSimple())
+          continue;
+
+        reportVectorizationFailure(
+            "Complex writes to memory unsupported in early exit loops",
+            "Cannot vectorize early exit loop with complex writes to memory",
+            "WritesInEarlyExitLoop", ORE, TheLoop);
+        return false;
+      }
+
       if (I.mayWriteToMemory()) {
         // We don't support writes to memory.
         reportVectorizationFailure(
-            "Writes to memory unsupported in early exit loops",
-            "Cannot vectorize early exit loop with writes to memory",
+            "Complex writes to memory unsupported in early exit loops",
+            "Cannot vectorize early exit loop with complex writes to memory",
             "WritesInEarlyExitLoop", ORE, TheLoop);
         return false;
-      } else if (!IsSafeOperation(&I)) {
+      }
+
+      if (!IsSafeOperation(&I)) {
         reportVectorizationFailure("Early exit loop contains operations that "
                                    "cannot be speculatively executed",
                                    "UnsafeOperationsEarlyExitLoop", ORE,
@@ -1732,13 +1794,53 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
 
   // TODO: Handle loops that may fault.
   Predicates.clear();
-  if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
-                                     &Predicates)) {
+
+  if (HasStore && EELoad.has_value()) {
+    LoadInst *LI = *EELoad;
+    if (isDereferenceableAndAlignedInLoop(LI, TheLoop, *PSE.getSE(), *DT, AC,
+                                          &Predicates)) {
+      ICFLoopSafetyInfo SafetyInfo;
+      SafetyInfo.computeLoopSafetyInfo(TheLoop);
+      // FIXME: Support (nested) conditional loads instead of rejecting them.
+      if (!SafetyInfo.isGuaranteedToExecute(*LI, DT, TheLoop)) {
+        LLVM_DEBUG(
+            dbgs() << "Early exit condition load not guaranteed to execute.\n");
+        reportVectorizationFailure(
+            "Early exit condition load not guaranteed to execute",
+            "Cannot vectorize early exit loop when condition load is not "
+            "guaranteed to execute",
+            "EarlyExitLoadNotGuaranteed", ORE, TheLoop);
+        return false; // A reported failure must also reject the loop.
+      }
+    } else {
+      LLVM_DEBUG(dbgs() << "Early exit condition load potentially unsafe.\n");
+      reportVectorizationFailure("Uncounted loop condition not known safe",
+                                 "Cannot vectorize early exit loop with "
+                                 "possibly unsafe condition load",
+                                 "PotentiallyFaultingEarlyExitLoop", ORE,
+                                 TheLoop);
+      return false;
+    }
+  } else if (HasStore) {
+    LLVM_DEBUG(dbgs() << "Found early exit store but no condition load.\n");
     reportVectorizationFailure(
-        "Loop may fault",
-        "Cannot vectorize potentially faulting early exit loop",
-        "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+        "Early exit loop with store but no condition load",
+        "Cannot vectorize early exit loop with store but no condition load",
+        "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
     return false;
+  } else {
+    // Read-only loop.
+    // FIXME: as with the loops with stores, only the loads contributing to
+    //        the loop condition need to be guaranteed dereferenceable and
+    //        aligned.
+    if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
+                                       &Predicates)) {
+      reportVectorizationFailure(
+          "Loop may fault",
+          "Cannot vectorize potentially faulting early exit loop",
+          "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
+      return false;
+    }
   }
 
   [[maybe_unused]] const SCEV *SymbolicMaxBTC =
@@ -1751,6 +1853,11 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
                        "backedge taken count: "
                     << *SymbolicMaxBTC << '\n');
   UncountableEdge = SingleUncountableEdge;
+  if (HasStore) {
+    RequiresEarlyExitConditionCopy = true;
+    EarlyExitLoad = EELoad;
+  }
+
   return true;
 }
 
@@ -1823,6 +1930,8 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
     } else {
       if (!isVectorizableEarlyExitLoop()) {
         UncountableEdge = std::nullopt;
+        EarlyExitLoad = std::nullopt;
+        RequiresEarlyExitConditionCopy = false;
         if (DoExtraAnalysis)
           Result = false;
         else

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9246,6 +9246,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
         VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
                                  *Plan, CM.getMinimalBitwidths());
       VPlanTransforms::optimize(*Plan);
+
+      // See if we can convert an early exit vplan to bail out to a scalar
+      // loop if state-changing operations (like stores) are present and
+      // an exit will be taken in the next vector iteration.
+      // If not, discard the plan.
+      if (Legal->isConditionCopyRequired() && !HasScalarVF &&
+          !VPlanTransforms::runPass(VPlanTransforms::tryEarlyExitConversion,
+                                    *Plan))
+        break;
       // TODO: try to put it close to addActiveLaneMask().
       // Discard the plan if it is not EVL-compatible
       if (CM.foldTailWithEVL() && !HasScalarVF &&
@@ -9570,6 +9579,12 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
           },
           Range);
   auto Plan = std::make_unique<VPlan>(OrigLoop);
+
+  // FIXME: Better place to put this? Or maybe an enum for how to handle
+  //        early exits?
+  if (Legal->hasUncountableEarlyExit())
+    Plan->setEarlyExitContinuesInScalarLoop(Legal->isConditionCopyRequired());
+
   // Build hierarchical CFG.
   // TODO: Convert to VPlan-transform and consolidate all transforms for VPlan
   // creation.
@@ -9876,6 +9891,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
 
   // Create new empty VPlan
   auto Plan = std::make_unique<VPlan>(OrigLoop);
+
+  // FIXME: Better place to put this? Or maybe an enum for how to handle
+  //        early exits?
+  if (Legal->hasUncountableEarlyExit())
+    Plan->setEarlyExitContinuesInScalarLoop(Legal->isConditionCopyRequired());
+
   // Build hierarchical CFG
   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
   HCFGBuilder.buildPlainCFG();

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3522,6 +3522,13 @@ class VPlan {
   /// VPlan is destroyed.
   SmallVector<VPBlockBase *> CreatedBlocks;
 
+  /// Indicates that an early exit loop will exit before the condition is
+  /// reached, and that the scalar loop must perform the last few iterations.
+  /// FIXME: Is this the right place? We mainly want to make sure that we
+  ///        know about this for transforming the plan to copy&move the exit
+  ///        condition, but maybe it doesn't need to be in the plan itself.
+  bool EarlyExitContinuesInScalarLoop = false;
+
   /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader
   /// wrapping the original header of the scalar loop.
   VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader)
@@ -3825,6 +3832,16 @@ class VPlan {
     return ExitBlocks.size() > 1 || ExitBlocks[0]->getNumPredecessors() > 1;
   }
 
+  /// Returns true if all exit paths should reach the scalar loop.
+  bool shouldEarlyExitContinueInScalarLoop() const {
+    return EarlyExitContinuesInScalarLoop;
+  }
+
+  /// Sets whether early exits should continue in the scalar loop.
+  void setEarlyExitContinuesInScalarLoop(bool Continues) {
+    EarlyExitContinuesInScalarLoop = Continues;
+  }
+
   /// Returns true if the scalar tail may execute after the vector loop. Note
   /// that this relies on unneeded branches to the scalar tail loop being
   /// removed.