[mlir][affine] Use value bound inference to determine minimum/maximum trip counts in loop analysis #128113

Open · wants to merge 8 commits into main

Conversation

linuxlonelyeagle
Member

@linuxlonelyeagle linuxlonelyeagle commented Feb 21, 2025

I think this is a great improvement; it contains the following changes.

  • Added logic for dynamic value inference
  • Added gpu unroll tests

@llvmbot
Member

llvmbot commented Feb 21, 2025

@llvm/pr-subscribers-mlir-gpu

@llvm/pr-subscribers-mlir-affine

Author: lonely eagle (linuxlonelyeagle)

Changes

I think this is a great improvement; it contains the following changes.

  • Added support for GPU unroll
    Although the thread_id op produces a dynamic Value, the trip count of a loop can still be determined from its range. Even when the work is divided across threads, the affine map only performs arithmetic on thread_id, so further computations on it can be regarded as computations on thread_id inside the loop (see the sketch below).
  • Added logic for dynamic value inference
    This PR only adds inference for thread_id, but the same approach applies to other dynamic values.
  • Removed invalid loops (minor change)
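
To make the idea concrete, here is a minimal sketch of the kind of loop this targets (the shape mirrors the partial_unroll_factor_4 test below; the numbers are just an example). With a block size of 2, %tid ranges over [0, 1]:

%tid = gpu.thread_id x
affine.for %iv = %tid to 9 step 2 {
  // loop body
}

The trip count is ceil((9 - %tid) / 2): 5 when %tid = 0 and 4 when %tid = 1. The minimum constant trip count (4) is the factor every thread can be unrolled by, and the maximum (5) tells the transformation whether a residual loop must be kept for the threads that execute one extra iteration.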

Full diff: https://github.com/llvm/llvm-project/pull/128113.diff

7 Files Affected:

  • (modified) mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h (+4)
  • (modified) mlir/include/mlir/Dialect/Affine/LoopUtils.h (+3)
  • (modified) mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (+6)
  • (modified) mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp (+96-14)
  • (modified) mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp (+46-11)
  • (modified) mlir/lib/Dialect/GPU/IR/GPUDialect.cpp (+20)
  • (modified) mlir/test/Dialect/Affine/unroll.mlir (+110)
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
index ed3c21d952a01..2bd540b9af2eb 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -43,6 +43,10 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
 /// constant trip count in non-trivial cases.
 std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
 
+/// In the GPU, the number of trip of each thread in the loop is inconsistent.
+/// This function returns the maximum number of trip.
+std::optional<uint64_t> getMaxConstantTripCount(AffineForOp forOp);
+
 /// Returns the greatest known integral divisor of the trip count. Affine
 /// expression analysis is used (indirectly through getTripCount), and
 /// this method is thus able to determine non-trivial divisors.
diff --git a/mlir/include/mlir/Dialect/Affine/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
index 7fe1f6d48ceeb..1d1d6d94d2382 100644
--- a/mlir/include/mlir/Dialect/Affine/LoopUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
@@ -86,6 +86,9 @@ LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
 /// was known to have a single iteration.
 LogicalResult promoteIfSingleIteration(AffineForOp forOp);
 
+/// Eliminate loops that will never actually execute.
+LogicalResult removeInvalidLoop(AffineForOp forOp);
+
 /// Promotes all single iteration AffineForOp's in the Function, i.e., moves
 /// their body into the containing Block.
 void promoteSingleIterationLoops(func::FuncOp f);
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 2b1ce573effd0..940d47c5ef2c8 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1035,6 +1035,12 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     static StringRef getNumWorkgroupAttributionsAttrName() {
       return "workgroup_attributions";
     }
+
+    /// Find BlockSize via the BlockArgument of gpu.launch.
+    Value getBlockSizeOnAxis(Value threadId);
+
+    ///  Find BlockSize via the Dimension Information.
+    Value getBlockSizeOnAxis(Dimension dimension);
   }];
 
   let hasCanonicalizer = 1;
diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index 0d4b0ea1668e0..15a5376fa922e 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -18,6 +18,7 @@
 #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "llvm/Support/MathExtras.h"
 
 #include "llvm/ADT/DenseSet.h"
@@ -84,6 +85,67 @@ void mlir::affine::getTripCountMapAndOperands(
                             tripCountValueMap.getOperands().end());
 }
 
+/// Replace thread_id with its maximum value, if `replaceWithZero` is true,
+/// thread_id will be replaced by its minimum value 0.
+static void replaceGPUOperands(AffineForOp forOp,
+                               SmallVectorImpl<Value> &operands,
+                               SmallVectorImpl<AffineExpr> &symReplacements,
+                               unsigned numDim, bool replaceWithZero = false) {
+  auto launchOp = forOp->getParentOfType<gpu::LaunchOp>();
+  if (!launchOp)
+    return;
+
+  // `b` is only used to create `AffineExpr`.
+  Builder b(forOp.getContext());
+  unsigned idx = 0;
+
+  for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
+    Value operand = operands[i];
+    if (Value blockSize = launchOp.getBlockSizeOnAxis(operand)) {
+      operands[i] = blockSize;
+      if (!replaceWithZero)
+        symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
+      else
+        symReplacements.push_back(b.getAffineConstantExpr(0));
+      continue;
+    }
+
+    Operation *defOp = operand.getDefiningOp();
+    if (!defOp) {
+      ++idx;
+      continue;
+    }
+
+    if (auto threadIdOp = mlir::dyn_cast<gpu::ThreadIdOp>(defOp)) {
+      gpu::Dimension dimension = threadIdOp.getDimension();
+      operands[i] = launchOp.getBlockSizeOnAxis(dimension);
+      if (!replaceWithZero)
+        symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
+      else
+        symReplacements.push_back(b.getAffineConstantExpr(0));
+      continue;
+    }
+    ++idx;
+  }
+}
+
+/// Take the min if all trip counts are constant.
+static std::optional<uint64_t>
+getConstantTripCountFromAffineMap(AffineMap map) {
+  std::optional<uint64_t> tripCount;
+  for (auto resultExpr : map.getResults()) {
+    auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr);
+    if (!constExpr)
+      return std::nullopt;
+    if (tripCount.has_value())
+      tripCount =
+          std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
+    else
+      tripCount = constExpr.getValue();
+  }
+  return tripCount;
+}
+
 /// Returns the trip count of the loop if it's a constant, std::nullopt
 /// otherwise. This method uses affine expression analysis (in turn using
 /// getTripCount) and is able to determine constant trip count in non-trivial
@@ -95,20 +157,34 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
 
   if (!map)
     return std::nullopt;
+  SmallVector<AffineExpr, 4> symReplacements;
+  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+                                  map.getNumSymbols());
+  affine::AffineValueMap valueMap(map, operands);
+  (void)valueMap.canonicalize();
+  map = valueMap.getAffineMap();
+  return getConstantTripCountFromAffineMap(map);
+}
 
-  // Take the min if all trip counts are constant.
-  std::optional<uint64_t> tripCount;
-  for (auto resultExpr : map.getResults()) {
-    if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
-      if (tripCount.has_value())
-        tripCount =
-            std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
-      else
-        tripCount = constExpr.getValue();
-    } else
-      return std::nullopt;
-  }
-  return tripCount;
+/// In some scenarios, such as GPU, the number of trip of each thread in the
+/// loop is inconsistent. This function returns the maximum number of trip.
+std::optional<uint64_t>
+mlir::affine::getMaxConstantTripCount(AffineForOp forOp) {
+  SmallVector<Value, 4> operands;
+  AffineMap map;
+  getTripCountMapAndOperands(forOp, &map, &operands);
+
+  if (!map)
+    return std::nullopt;
+  SmallVector<AffineExpr, 4> symReplacements;
+  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims(), true);
+  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+                                  map.getNumSymbols());
+  affine::AffineValueMap valueMap(map, operands);
+  (void)valueMap.canonicalize();
+  map = valueMap.getAffineMap();
+  return getConstantTripCountFromAffineMap(map);
 }
 
 /// Returns the greatest known integral divisor of the trip count. Affine
@@ -121,7 +197,13 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
 
   if (!map)
     return 1;
-
+  SmallVector<AffineExpr, 4> symReplacements;
+  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+                                  map.getNumSymbols());
+  affine::AffineValueMap valueMap(map, operands);
+  (void)valueMap.canonicalize();
+  map = valueMap.getAffineMap();
   // The largest divisor of the trip count is the GCD of the individual largest
   // divisors.
   assert(map.getNumResults() >= 1 && "expected one or more results");
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 4e02559a08949..69ceb0f80095b 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/IRMapping.h"
@@ -113,11 +114,29 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
     std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
 }
 
+/// Eliminate loops that will never actually execute
+LogicalResult mlir::affine::removeInvalidLoop(AffineForOp forOp) {
+  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
+  std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
+  if (!tripCount || *tripCount > 0 || !maxTripCount || *maxTripCount > 0)
+    return failure();
+
+  auto iterOperands = forOp.getInits();
+  auto results = forOp.getResults();
+  for (auto [result, operand] : llvm::zip(results, iterOperands))
+    result.replaceAllUsesWith(operand);
+
+  IRRewriter b(forOp);
+  b.eraseOp(forOp);
+  return success();
+}
+
 /// Promotes the loop body of a forOp to its containing block if the forOp
 /// was known to have a single iteration.
 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
-  if (!tripCount || *tripCount != 1)
+  std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
+  if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1)
     return failure();
 
   // TODO: extend this for arbitrary affine bounds.
@@ -160,7 +179,8 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   forOp.getBody()->back().erase();
   parentBlock->getOperations().splice(Block::iterator(forOp),
                                       forOp.getBody()->getOperations());
-  forOp.erase();
+  IRRewriter b(forOp.getContext());
+  b.eraseOp(forOp);
   return success();
 }
 
@@ -884,15 +904,27 @@ void mlir::affine::getTileableBands(
 /// Unrolls this loop completely.
 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
-  if (mayBeConstantTripCount.has_value()) {
-    uint64_t tripCount = *mayBeConstantTripCount;
-    if (tripCount == 0)
-      return success();
-    if (tripCount == 1)
-      return promoteIfSingleIteration(forOp);
-    return loopUnrollByFactor(forOp, tripCount);
-  }
-  return failure();
+  std::optional<uint64_t> maxMayBeConstantTripCount =
+      getMaxConstantTripCount(forOp);
+
+  if (!mayBeConstantTripCount.has_value() &&
+      !maxMayBeConstantTripCount.has_value())
+    return failure();
+
+  uint64_t tripCount = *mayBeConstantTripCount;
+  uint64_t maxTripCount = *maxMayBeConstantTripCount;
+
+  // The values of Trip are all 0, and the invalid loop is deleted.
+  if (tripCount <= 0 && maxTripCount <= 0)
+    return removeInvalidLoop(forOp);
+
+  // In special cases, such as in a GPU, only some threads execute this loop.
+  if (tripCount == 0 && maxTripCount == 1)
+    return success();
+
+  if (tripCount == 1 && maxTripCount == 1)
+    return promoteIfSingleIteration(forOp);
+  return loopUnrollByFactor(forOp, tripCount);
 }
 
 /// Unrolls this loop by the specified factor or by the trip count (if constant)
@@ -1013,8 +1045,11 @@ LogicalResult mlir::affine::loopUnrollByFactor(
   assert(unrollFactor > 0 && "unroll factor should be positive");
 
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
+  std::optional<uint64_t> maxMayBeConstantTripCount =
+      getMaxConstantTripCount(forOp);
   if (unrollFactor == 1) {
     if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
+        maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
         failed(promoteIfSingleIteration(forOp)))
       return failure();
     return success();
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index d06f10d3137a1..31051ed7e55a2 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -799,6 +799,26 @@ std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
   return KernelDim3{operands[6], operands[7], operands[8]};
 }
 
+Value LaunchOp::getBlockSizeOnAxis(Dimension dimension) {
+  if (dimension == Dimension::x)
+    return getBlockSizeX();
+  else if (dimension == Dimension::y)
+    return getBlockSizeY();
+  else
+    return getBlockSizeZ();
+}
+
+Value LaunchOp::getBlockSizeOnAxis(Value threadId) {
+  KernelDim3 threadIds = getThreadIds();
+  if (threadIds.x == threadId)
+    return getBlockSizeX();
+  else if (threadIds.y == threadId)
+    return getBlockSizeY();
+  else if (threadIds.z == threadId)
+    return getBlockSizeZ();
+  return {};
+}
+
 LogicalResult LaunchOp::verify() {
   if (!(hasClusterSize()) &&
       (getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index 574e9f41494af..a2bb0b2cac4e3 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -23,6 +23,7 @@
 // UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
 // UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
 // UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)>
+// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)>
 
 // UNROLL-FULL-LABEL: func @loop_nest_simplest() {
 func.func @loop_nest_simplest() {
@@ -258,6 +259,89 @@ gpu.module @unroll_full {
   }
 }
 
+// UNROLL-FULL-LABEL: func @thread_partial_execution
+func.func @thread_partial_execution() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index    
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    affine.for %iv = %tx to 3 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    // UNROLL-FULL: %{{.*}} = affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+    // UNROLL-FULL:   %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+    // UNROLL-FULL:   affine.yield %[[SUM]] : index
+    // UNROLL-FULL: }
+    gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL: func @invalid_loop
+func.func @invalid_loop() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    affine.for %iv = %tx to 0 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    gpu.terminator
+    // UNROLL-FULL-CHECK: %{{.*}} = gpu.thread_id  x
+    // UNROLL-FULL-CHECK: gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL: func @unroll_all_thread
+func.func @unroll_all_thread() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  // UNROLL-FULL-CHECK: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    %4 = affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    // UNROLL-FULL-CHECK: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+    // UNROLL-FULL-CHECK: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+    // UNROLL-FULL-CHECK: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+    gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL:   func.func @partial_unroll_factor_4
+func.func @partial_unroll_factor_4() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  // UNROLL-FULL:           %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    gpu.terminator
+  }
+  // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id  x
+  // UNROLL-FULL: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+  // UNROLL-FULL:   %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-FULL:   %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-FULL:   %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-FULL:   %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-FULL:   affine.yield %[[SUM_3]] : index
+  // UNROLL-FULL: }
+  return
+}
+
 // SHORT-LABEL: func @loop_nest_outer_unroll() {
 func.func @loop_nest_outer_unroll() {
   // SHORT:      affine.for %arg0 = 0 to 4 {
@@ -701,6 +785,32 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32
   return %sum : f32
 }
 
+// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4
+func.func @gpu_launch_unroll_by_factor_4() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  // UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    gpu.terminator
+  }
+  // UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id  x
+  // UNROLL-BY-4: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+  // UNROLL-BY-4: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-BY-4: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-BY-4: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-BY-4: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+  // UNROLL-BY-4:   %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-BY-4:   affine.yield %[[SUM_4]] : index
+  // UNROLL-BY-4: }
+  return
+}
+
 // UNROLL-FULL: func @unroll_zero_trip_count_case
 func.func @unroll_zero_trip_count_case() {
   // CHECK-NEXT: affine.for %{{.*}} = 0 to 0

@llvmbot
Member

llvmbot commented Feb 21, 2025

@llvm/pr-subscribers-mlir

@linuxlonelyeagle
Member Author

Ping @bondhugula I think this PR needs you, thank you.

Contributor

@krzysz00 krzysz00 left a comment

I have concerns about this.

I'm a bit dubious about how this special-case for fetching GPU ID bounds works in general.

I'm extremely dubious about this getting hooked up to the loop unroller, of all things.

If what you're trying to do is to remove loops that're guaranteed to run exactly once, I'd start by trying to upstream the code in, say, https://github.com/iree-org/iree/blob/26a89c30329b18efd5ac29aff258402b481ea9e4/compiler/src/iree/compiler/Codegen/Transforms/RemoveSingleIterationLoop.cpp - which is a pass that uses the affine value bounds analysis to do exactly this

Especially since gpu.launch now implements ValueBoundsOpInterface

Contributor

@bondhugula bondhugula left a comment

While it's useful to have a getUpperBoundOnTripCount, specially handling GPU thread IDs in an affine utility isn't a proper, reusable, or generic way to do it. Instead, see the integer range analysis framework that's already available. It's more general and can be extended as needed.

if (tripCount <= 0)
return success();

if (tripCount == 1 && maxTripCount == 1)
Contributor

If the trip count is known to be one, how can the max trip count be anything other than one?!

Member Author

Maybe maxTripCount will be equal to 2.

Contributor

Why would getConstantMaxTripCount return a value different from the constant trip count when the trip count is known to be so? It shouldn't - otherwise, it's trivially loose.

Member Author

You are obviously talking about the CPU, where the trip count is indeed constant, but for hardware like a GPU, threadId is a dynamic value. The smallest threadId is 0, and the largest is blockSize - 1. The value of (upper - threadId) / step is obviously not constant.
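
As a concrete (made-up) example, take blockSize = 4, upper = 9 and step = 4: the thread with threadId = 0 runs ceil((9 - 0) / 4) = 3 iterations, while the thread with threadId = 3 runs ceil((9 - 3) / 4) = 2, so the trip count really does vary per thread.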

Member Author

Could you please look at the comments below? I'm wondering whether affine-loop-unroll not being a pattern-based pass is causing this issue (if you have the time). I'll continue to work on it.

Member Author

If it can run, it will definitely be a huge improvement; it's really exciting.

Contributor

A lot of this confusion would be cleared up if tripCount were minTripCount

Member

I'm not sure it would. The semantics of affine loops is to take the minimum of the values produced by each individual expression in the upper bound; it's unclear to me why we would need to reason about the upper bound.

@linuxlonelyeagle
Member Author

Ping @bondhugula @krzysz00 Looks like I need your help.
Let me describe the current situation.

    // When the lattice belongs to %tx (thread_id x in the launchOp), this returns true, so the unroll fails.
    if (lattice->getValue().isUninitialized()) {
      symReplacements.push_back(b.getAffineSymbolExpr(i - numDim));
      continue;
    }

I analyzed the reasons: although the analysis implements the following functions, it does not actually run into any of them here. I can tell because mlir-opt -debug outputs which ops were analyzed.

  /// Visit an operation. Invoke the transfer function on each operation that
  /// implements `InferIntRangeInterface`.
  LogicalResult
  visitOperation(Operation *op,
                 ArrayRef<const IntegerValueRangeLattice *> operands,
                 ArrayRef<IntegerValueRangeLattice *> results) override;

  /// Visit block arguments or operation results of an operation with region
  /// control-flow for which values are not defined by region control-flow. This
  /// function calls `InferIntRangeInterface` to provide values for block
  /// arguments or tries to reduce the range on loop induction variables with
  /// known bounds.
  void
  visitNonControlFlowArguments(Operation *op, const RegionSuccessor &successor,
                               ArrayRef<IntegerValueRangeLattice *> argLattices,
                               unsigned firstIndex) override;

I have extended LaunchOp::inferResultRanges. Given the particular structure of launchOp, I have no idea whether I should implement the other visit functions. This is my first time using the analysis framework, and there are indeed some places I don't understand. I hope you can give me some advice (debugging advice would be great too).

@linuxlonelyeagle
Member Author

I found that int-range-optimizations can successfully run the extension I made; maybe I can find the problem by looking at how it is set up.

@linuxlonelyeagle
Member Author

I feel like this question is very abstract. Is there something wrong with the lattice I obtained this way? It works just fine on the existing tests, so I'm wondering if there's a potential problem here. Thanks.

  solver.load<dataflow::IntegerRangeAnalysis>();
  if (failed(solver.initializeAndRun(
          forOp->getParentOfType<FunctionOpInterface>())))
    return;
  LLVM_DEBUG(llvm::dbgs() << "after init and run");
  for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
    Value operand = operands[i];
    auto lattice =
        solver.lookupState<dataflow::IntegerValueRangeLattice>(operand);

@linuxlonelyeagle
Member Author

Ping @bondhugula @krzysz00 Can we speed up the review of this PR?

@krzysz00
Contributor

Also, for affine, I think you want ValueBoundsOpInterface and the like, not IntegerRangeInference

@krzysz00
Contributor

(Re the lattice, you need to have DeadCodeAnalysis loaded)

But also look into ValueBoundsConstraintSet - it works better with affine
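
For reference, the usual setup looks roughly like this (sketch only, using the forOp/operand names from your snippet above; adapt it to the surrounding code):

// Headers: mlir/Analysis/DataFlowFramework.h and
// mlir/Analysis/DataFlow/{DeadCodeAnalysis,ConstantPropagationAnalysis,IntegerRangeAnalysis}.h
DataFlowSolver solver;
// Without DeadCodeAnalysis the sparse lattices are never marked live and stay
// uninitialized; SparseConstantPropagation is commonly loaded alongside it.
solver.load<dataflow::DeadCodeAnalysis>();
solver.load<dataflow::SparseConstantPropagation>();
solver.load<dataflow::IntegerRangeAnalysis>();
if (failed(solver.initializeAndRun(forOp->getParentOfType<FunctionOpInterface>())))
  return;
auto *lattice = solver.lookupState<dataflow::IntegerValueRangeLattice>(operand);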

@linuxlonelyeagle linuxlonelyeagle changed the title [mlir][affine][gpu] support unroll dynamic value and apply it to gpu.thread_id op [mlir][affine][gpu] support unroll dynamic value and add gpu unroll test. Feb 28, 2025
@linuxlonelyeagle
Member Author

I implemented it using the ValueBoundsOpInterface; maybe we can continue reviewing this PR.

@linuxlonelyeagle
Member Author

@bondhugula @krzysz00 I'm a little concerned (since I'll be submitting other PRs): can we continue reviewing this PR?

@linuxlonelyeagle
Member Author

Maybe you are worried about whether this is a safe enough method. In my opinion, it is.
The previous implementation replaced the operands with their minimum and maximum values. But I found a potential problem: if the affine map has multiple operands, the result may not be accurate. The ValueBoundsOpInterface, however, can compute the maximum and minimum values of the whole map result, so I think it is safe and correct enough.

One more thing: my program runs just fine using this pass.
So I think the changes in this PR are a safe and reliable enough solution.

@linuxlonelyeagle
Member Author

Ping for review @bondhugula @krzysz00. This PR has been around a long time; I would love for it to move forward. Thanks all.

Contributor

@krzysz00 krzysz00 left a comment

To give an overall shape of my thoughts, this PR makes sense but its name and PR description need to be clarified.

That is, the nature of the change is "[mlir][affine] Use value bound inference to determine minimum/maximum trip counts in loop analysis"

if (tripCount <= 0)
return success();

if (tripCount == 1 && maxTripCount == 1)
Contributor

A lot of this confusion would be cleared up if tripCount were minTripCount

@linuxlonelyeagle linuxlonelyeagle changed the title [mlir][affine][gpu] support unroll dynamic value and add gpu unroll test. [mlir][affine] Use value bound inference to determine minimum/maximum trip counts in loop analysis Mar 18, 2025
@linuxlonelyeagle
Member Author

@bondhugula Ping for review. If there is anything about this PR that confuses you, further discussion is welcome.

@linuxlonelyeagle
Member Author

@bondhugula ping for review.

@linuxlonelyeagle
Member Author

I'm very sorry for pinging you multiple times, but both @krzysz00 and I think this PR makes sense.

@ftynse ftynse requested review from krzysz00 and bondhugula April 8, 2025 07:56
Member

@ftynse ftynse left a comment

I don't understand the need for min/max bound logic here. computeConstantBound allows one to compute the "equality" bound, i.e., whether the value is known to always equal a specific constant. That would be a direct equivalent of the existing logic using a more advanced analysis.
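
Roughly like this (a sketch from memory, not the exact overload; treat the signature and the tripCountValue operand as placeholders):

// Try to prove the value is always equal to one specific constant.
FailureOr<int64_t> eq = ValueBoundsConstraintSet::computeConstantBound(
    presburger::BoundType::EQ, tripCountValue);
if (succeeded(eq)) {
  // *eq is the single constant value; no separate min/max reasoning needed.
}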

return std::nullopt;
}
return tripCount;
/// Returns the maximum trip count when the operand of forOp has a range. If the
Member

So what this does internally is compute an upper bound on each expression "potential upper bound - single lower bound" and take a minimum of that. Can you provide a mathematical justification as to why this provides a correct (and tight?) upper bound?

if (tripCount <= 0)
return success();

if (tripCount == 1 && maxTripCount == 1)
Member

I'm not sure it would. The semantics of affine loops is to take the minimum of the values produced by each individual expression in the upper bound; it's unclear to me why we would need to reason about the upper bound.

%0 = arith.constant 0 :index
%1 = arith.constant 2 : index
// UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
Member

Could we avoid using GPU dialect operations here? I suppose we have tests for the bound analysis somewhere that must use test ops with known bounds; we could use those instead and not spuriously rely on the logic of another dialect here.

Contributor

@krzysz00 krzysz00 left a comment

I sadly must resign as a substantive reviewer - I don't know this code well enough to have opinions on it

@linuxlonelyeagle
Member Author

linuxlonelyeagle commented Apr 9, 2025

It's been a while since I last looked at this pass, and there are some details I need to confirm. There are some comments I haven't marked as resolved; I'll answer them all together later.

@linuxlonelyeagle
Member Author

linuxlonelyeagle commented Apr 9, 2025

@krzysz00 I think you can still give ideas for improvement, since you know GPUs very well. This is based on the comments left by @ftynse. I have not marked the comments above as resolved; I think several of those questions share the same answer, so I'll give a unified reply here.

I don't understand the need for min/max bound logic here.
it's unclear to me why we would need to reason about the upper bound.
maxTripCount is always greater than or equal to tripCount. In the CPU case they are equal. The original commit included the removal of invalid loops, which has now been dropped from this PR.

std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
  • Keep the loop in this case:
    tripCount = 0
    maxTripCount = 1

  • Promote the loop body out of the loop in this case:
    tripCount = 1
    maxTripCount = 1

  • General case (tripCount > 1):
    Unroll the loop body and then keep one trailing loop, which acts like an if statement guarding the boundary, since only some threads will execute the code in the loop.

  • Core idea:
    tripCount = (upper - (blockSize - 1)) ceildiv stride
    maxTripCount = (upper - 0) ceildiv stride
    where blockSize - 1 = maxThreadId and 0 = minThreadId.
    The same rules apply to any Value with a known range.

  • The role of the min bound:
    The min bound determines the unroll factor.

  • The role of the max bound:
    The max bound determines whether to keep the last loop (which is equivalent to an if statement).

whether the value is known to always be equal
The following is a case where they are equal.

// thread size = 2, min thread id = 0, max thread id = 1
%thread_id = gpu.thread_id x
affine.for %iv = %thread_id to 2 step 2 {
  // use %iv
}
// using ceiling division:
// max trip = ceil((2 - 0) / 2) = 1
// min trip = ceil((2 - 1) / 2) = 1

Can you provide a mathematical justification as to why this provides a correct (and tight?) upper bound?
I may not be able to provide a mathematical formula. But for the max trip count, it is true that std::min should not be used; std::max should be used.

Suppose there is a loop whose affine map has two results, with ranges [4, 6] and [2, 3]. Previously the combined result would have been [2, 3], which is clearly not right; now the result is [2, 6]. The 2 shows that the unroll trip count should be 2, and the 6 shows the need to keep a loop that guards the boundary.

Could we avoid using GPU dialect operations here?
I still think I should keep the current tests, since they cover my use case, and the original goal of this PR was for GPUs as well.

Further discussion is welcome.

@linuxlonelyeagle
Member Author

Can you guys tell me what you think?

@ftynse
Member

ftynse commented Apr 14, 2025

tripCount = (upper - (blockSize - 1)) ceildiv stride
maxTripCount = (upper - 0) ceildiv stride
blockSize - 1 = maxThreadId
0 = minThreadId

Affine loop transformations shouldn't care about block sizes or thread IDs. They should only reason about ranges of the induction variables, regardless of where those ranges come from.

max bound is used to determine whether to keep the last loop (which is equivalent to an if statement).

It appears to me that the loop must be preserved whenever the upper bound is not equal to the lower bound, which is equivalent to saying we cannot find the single constant bound (min=max=constant). This is exactly what we already do, just using less powerful reasoning.

Suppose there is a loop here, but affineMap has two results. They are in the ranges [4, 6] and [2, 3], and before they should have resulted in [2, 3], which is clearly not true, and now the result is [2, 6]. It shows that the unroll trip should be 2, and 6 shows the need to keep a loop that controls the boundary.

The upper bound of the affine loop is systematically the minimum of all its expressions. So if you have two expressions with ranges [4, 6] and [2, 3] (non-overlapping), the expression with range [4, 6] will never affect the bound. We could even remove it as an optimization.

This is why it's important to understand the mathematics behind these computations...

@linuxlonelyeagle
Member Author

linuxlonelyeagle commented Apr 14, 2025

Affine loop transformations shouldn't care about block sizes or thread IDs. They should only reason about ranges of the induction variables, regardless of where those ranges come from.

Yes, you are right. I only used a GPU example because that is what I work with every day. But the work in this PR is not limited to GPUs: if a Value has a known range, this PR applies as well.

It appears to me that the loop must be preserved whenever the upper bound is not equal to the lower bound, which is equivalent to saying we cannot find the single constant bound (min=max=constant). This is exactly what we already do, just using less powerful reasoning.

You are right.

The upper bound of the affine loop is systematically a minimum of all the expressions. So if you two expressions with ranges [4, 6] and [2, 3] (non-overlapping), the expressions with [4, 6] will never affect the bound. We can even remove it as an optimization.

I think this part is not right. Although the ranges don't overlap, their lower bounds are 2 and 4 respectively, which means one expression allows a maximum unroll factor of 2 and the other allows 4, so together the loop can still be unrolled by a factor of 2.

@linuxlonelyeagle
Member Author

I appreciate your patient thinking (but there really is some fun behind this principle).

@krzysz00
Contributor

Re the [2, 3] and [4, 6] example, what I think @ftynse is saying is that, by definition of affine.for, the upper bound will always be the [2, 3] value, and the [4, 6] value can be ignored.


Now, to give the proposed optimization without GPU context

Let 0 <= x < y - we don't know what x is, but it's in [0, y)

Then, the loop

affine.for %arg0 = %x to %y step %y {
  [body]
}

is guaranteed to run exactly once and so can be unrolled to just body.

Usually, %y here is a constant - in the context of this GPU stuff, it'll be something like "number of threads in a block"

In the unrolling case, this generalizes to

// 0 <= %r < %y
%b = %y * N + %r
affine.for %arg0 = %x to %b step %y {
  ...
}

being unrollable N times.
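
A quick numeric check of that claim (the concrete values are assumed purely for illustration): take %y = 4 and N = 2, so %b = 8 + %r with 0 <= %r <= 3 and 0 <= %x <= 3. The trip count is ceildiv(%b - %x, 4), which ranges from ceildiv(8 - 3, 4) = 2 up to ceildiv(11 - 0, 4) = 3, so at least N = 2 iterations are guaranteed: the body can be unrolled twice, with a residual loop covering the possible third iteration.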

Note similarly, that if we have 0 <= %z < %y, we can optimize

affine.for %arg0 = %x to %z step %y {
  ...
}

to

affine.if %x < %z {
  ...
}
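
(The affine.if condition above is written informally; in actual affine.if syntax it would be expressed as an integer set, e.g. something like affine.if affine_set<()[s0, s1] : (s1 - s0 - 1 >= 0)>()[%x, %z].)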

@linuxlonelyeagle
Member Author

I already understand what @ftynse is saying. What is meant here is: if the two expressions have ranges [4, 6] and [2, 3], we can determine whether they overlap, and if they don't, we just take the range with the smaller lower bound and discard [4, 6]. @krzysz00 this should be a point of optimization.

for (unsigned i = 0, e = map.getResults().size(); i < e; ++i) {

Usually, %y here is a constant - in the context of this GPU stuff, it'll be something like "number of threads in a block"
In the unrolling case, this generalizes to

What you mention here should be done in addition to this PR (if I remember correctly, there were TODOs saying something like this should be accomplished).

@krzysz00
Contributor

krzysz00 commented May 2, 2025

Looking over the PR again, I think there's a different phrasing of the proposed improvement that'll behave better but might require some comments and/or refactoring.

Suppose we have a loop where the lower bounds are L = {l1, l2, ..., lM}, the upper bounds are U = {u1, u2, ..., uN}, and the step is S. This means that the trip count T is bounded by (min(U) - max(L)) ceildiv S <= T <= (max(U) - min(L)) ceildiv S.

Now, for non-constant l_i and u_i, we can sometimes use the value bounds analysis to determine their minima or maxima, and use those in those min/max bounds. That is, if I have %x such that I know 0 <= %x <= 63, then max({%x}) would be 63 and min({%x}) would be 0.
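
As a worked instance of that bound (the numbers are chosen purely for illustration): with L = {%x} where 0 <= %x <= 63, U = {128}, and S = 32, we get (128 - 63) ceildiv 32 = 3 <= T <= (128 - 0) ceildiv 32 = 4, so the loop is guaranteed at least three iterations and runs at most four.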

So, a transformation I'm convinced of - and one that maybe should live in loop normalization, it's hard to tell - is noting that if the bounds on the trip count are 1 <= T <= 1, the loop is trivial, and if we have 0 <= T <= 1, then this loop is just an affine.if min(U) > max(L).

I think where unrolling comes in is that if T >= n * F for some unrolling factor F, you can unroll n iterations of the loop, and then your lower bound becomes {l1, l2, ..., lM} - n * F * S and your upper bound becomes {u1, u2, ..., uN} - n * F * S.

Please poke me if either my math's wrong or I haven't summarized the proposed change correctly

(This got lost in my tabs, apologies for the late response. I think there's something here, it's just an area where it's important to be rather careful with the details)

@linuxlonelyeagle
Member Author

Looking over the PR again, I think there's a different phrasing of the proposed improvement that'll behave better but might require some comments and/or refactoring.

Suppose we have a loop where the lower bounds are L = {l1, l2, ..., lM}, the upper bounds are U = {u1, u2, ..., uN}, and the step is S. This means that the trip count T is bounded by (min(U) - max(L)) ceildiv S <= T <= (max(U) - min(L)) ceildiv S.

Now, for non-constant l_i and u_i, we can sometimes use the value bounds analysis to determine their minima or maxima, and use those in those min/max bounds. That is, if I have %x such that I know 0 <= %x <= 63, then max({%x}) would be 63 and min({%x}) would be 0.

So, a transformation I'm convinced of - and one that maybe should live in loop normalization, it's hard to tell - is noting that if the bounds on the trip count are 1 <= T <= 1, the loop is trivial, and if we have 0 <= T <= 1, then this loop is just an affine.if min(U) > max(L).

I think where unrolling comes in is that if T >= n * F for some unrolling factor F, you can unroll n iterations of the loop, and then your lower bound becomes {l1, l2, ..., lM} - n * F * S and your upper bound becomes {u1, u2, ..., uN} - n * F * S.

Please poke me if either my math's wrong or I haven't summarized the proposed change correctly

(This got lost in my tabs, apologies for the late response. I think there's something here, it's just an area where it's important to be rather careful with the details)

I have some important things to take care of these days. When I'm done with them, I'll move forward on this. Thank you.
