diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
index 43d61832cafdd..e875e2cdc9f66 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -43,6 +43,16 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
 /// constant trip count in non-trivial cases.
 std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
 
+/// Returns a lower bound on the trip count of the loop, or std::nullopt if no
+/// constant lower bound can be inferred. If the trip count is a constant, the
+/// return value is the same as `getConstantTripCount`.
+std::optional<uint64_t> getLowerBoundOnTripCount(AffineForOp forOp);
+
+/// Returns an upper bound on the trip count of the loop, or std::nullopt if no
+/// constant upper bound can be inferred. If the trip count is a constant, the
+/// return value is the same as `getConstantTripCount`.
+std::optional<uint64_t> getUpperBoundOnTripCount(AffineForOp forOp);
+
 /// Returns the greatest known integral divisor of the trip count. Affine
 /// expression analysis is used (indirectly through getTripCount), and
 /// this method is thus able to determine non-trivial divisors.
diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index 01cc500148385..4815600e8fa54 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -18,6 +18,8 @@
 #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "llvm/Support/MathExtras.h"
 
 #include "llvm/ADT/DenseSet.h"
@@ -212,32 +214,79 @@ void mlir::affine::getTripCountMapAndOperands(
                             tripCountValueMap.getOperands().end());
 }
 
+/// Returns a constant bound of kind `type` (BoundType::LB or BoundType::UB) on
+/// the trip count given by `map` applied to `operands`, or std::nullopt if no
+/// such bound can be inferred. The trip count is the minimum over the results
+/// of `map`, so either kind of bound is the minimum of the per-result bounds.
+/// Negative bounds are clamped to zero since such a loop never executes.
+static std::optional<uint64_t>
+getKnownTripCountBound(AffineMap map, SmallVectorImpl<Value> &operands,
+                       presburger::BoundType type) {
+  if (type != presburger::BoundType::LB && type != presburger::BoundType::UB)
+    return std::nullopt;
+
+  std::optional<uint64_t> tripCount;
+  for (unsigned i = 0, e = map.getNumResults(); i < e; ++i) {
+    AffineMap subMap = map.getSubMap(i);
+    ValueBoundsConstraintSet::Variable var(subMap, operands);
+    FailureOr<int64_t> bound = ValueBoundsConstraintSet::computeConstantBound(
+        type, var, /*stopCondition=*/nullptr, /*closedUB=*/true);
+    if (failed(bound))
+      return std::nullopt;
+    uint64_t clamped = static_cast<uint64_t>(std::max<int64_t>(*bound, 0));
+    tripCount = tripCount ? std::min(*tripCount, clamped) : clamped;
+  }
+  return tripCount;
+}
+
 /// Returns the trip count of the loop if it's a constant, std::nullopt
 /// otherwise. This method uses affine expression analysis (in turn using
 /// getTripCount) and is able to determine constant trip count in non-trivial
 /// cases.
 std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
-  SmallVector<Value, 4> operands;
+  SmallVector<Value> operands;
   AffineMap map;
   getTripCountMapAndOperands(forOp, &map, &operands);
 
   if (!map)
     return std::nullopt;
 
-  // Take the min if all trip counts are constant.
-  std::optional<uint64_t> tripCount;
-  for (auto resultExpr : map.getResults()) {
-    if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
-      if (tripCount.has_value())
-        tripCount =
-            std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
-      else
-        tripCount = constExpr.getValue();
-    } else {
-      return std::nullopt;
-    }
-  }
-  return tripCount;
+  // The trip count is a constant only if its lower and upper bounds coincide.
+  std::optional<uint64_t> minTripCount =
+      getKnownTripCountBound(map, operands, presburger::BoundType::LB);
+  std::optional<uint64_t> maxTripCount =
+      getKnownTripCountBound(map, operands, presburger::BoundType::UB);
+  if (!minTripCount || !maxTripCount || *minTripCount != *maxTripCount)
+    return std::nullopt;
+  return minTripCount;
+}
+
+/// Returns a lower bound on the trip count of the loop, or std::nullopt if no
+/// constant lower bound can be inferred. If the trip count is a constant, the
+/// return value is the same as `getConstantTripCount`.
+std::optional<uint64_t>
+mlir::affine::getLowerBoundOnTripCount(AffineForOp forOp) {
+  SmallVector<Value> operands;
+  AffineMap map;
+  getTripCountMapAndOperands(forOp, &map, &operands);
+
+  if (!map)
+    return std::nullopt;
+  return getKnownTripCountBound(map, operands, presburger::BoundType::LB);
+}
+
+/// Returns an upper bound on the trip count of the loop, or std::nullopt if no
+/// constant upper bound can be inferred. If the trip count is a constant, the
+/// return value is the same as `getConstantTripCount`.
+std::optional<uint64_t>
+mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) {
+  SmallVector<Value> operands;
+  AffineMap map;
+  getTripCountMapAndOperands(forOp, &map, &operands);
+
+  if (!map)
+    return std::nullopt;
+  return getKnownTripCountBound(map, operands, presburger::BoundType::UB);
 }
 
 /// Returns the greatest known integral divisor of the trip count. Affine
@@ -255,10 +304,18 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
   // divisors.
   assert(map.getNumResults() >= 1 && "expected one or more results");
   std::optional<uint64_t> gcd;
-  for (auto resultExpr : map.getResults()) {
+  for (unsigned i = 0, e = map.getNumResults(); i < e; ++i) {
     uint64_t thisGcd;
-    if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
-      uint64_t tripCount = constExpr.getValue();
+    AffineMap subMap = map.getSubMap(i);
+    ValueBoundsConstraintSet::Variable var(subMap, operands);
+    auto lbBound = ValueBoundsConstraintSet::computeConstantBound(
+        presburger::BoundType::LB, var);
+    auto ubBound = ValueBoundsConstraintSet::computeConstantBound(
+        presburger::BoundType::UB, var, nullptr, /*closedUB=*/true);
+    // Only a result whose lower and upper bounds coincide is a known constant;
+    // a lower bound alone says nothing about the divisors of the trip count.
+    if (succeeded(lbBound) && succeeded(ubBound) && *lbBound == *ubBound) {
+      uint64_t tripCount = std::max<int64_t>(*lbBound, 0);
       // 0 iteration loops (greatest divisor is 2^64 - 1).
       if (tripCount == 0)
         thisGcd = std::numeric_limits<uint64_t>::max();
@@ -267,7 +324,7 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
         thisGcd = tripCount;
     } else {
       // Trip count is not a known constant; return its largest known divisor.
-      thisGcd = resultExpr.getLargestKnownDivisor();
+      thisGcd = map.getResult(i).getLargestKnownDivisor();
     }
     if (gcd.has_value())
       gcd = std::gcd(*gcd, thisGcd);
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 4aa1fe318efa8..e228181b80d55 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -885,15 +885,20 @@ void mlir::affine::getTileableBands(
 /// Unrolls this loop completely.
 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
-  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
-  if (mayBeConstantTripCount.has_value()) {
-    uint64_t tripCount = *mayBeConstantTripCount;
-    if (tripCount == 0)
-      return success();
-    if (tripCount == 1)
-      return promoteIfSingleIteration(forOp);
-    return loopUnrollByFactor(forOp, tripCount);
-  }
-  return failure();
+  // The trip count may only be known to lie in a range. In that case, unroll
+  // by its lower bound; the remaining iterations are left to a cleanup loop.
+  std::optional<uint64_t> minTripCount = getLowerBoundOnTripCount(forOp);
+  std::optional<uint64_t> maxTripCount = getUpperBoundOnTripCount(forOp);
+  if (!minTripCount || !maxTripCount)
+    return failure();
+
+  // No iteration is guaranteed to execute, so there is nothing to unroll.
+  if (*minTripCount == 0)
+    return success();
+
+  if (succeeded(promoteIfSingleIteration(forOp)))
+    return success();
+
+  return loopUnrollByFactor(forOp, *minTripCount);
 }
 
 /// Unrolls this loop by the specified factor or by the trip count (if constant)
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index 574e9f41494af..24df89bf8a76e 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -12,6 +12,7 @@
 // UNROLL-FULL-DAG: [[$MAP4:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + 1)>
 // UNROLL-FULL-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + 3)>
 // UNROLL-FULL-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
+// UNROLL-FULL-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 9) ceildiv 2) floordiv 4) * 8)>
 
 // SHORT-DAG: [[$MAP0:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)>
 
@@ -22,7 +23,8 @@
 // UNROLL-BY-4-DAG: [[$MAP4:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + 3)>
 // UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
 // UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
-// UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)>
+// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<(d0) -> (d0)>
+// UNROLL-BY-4-DAG: [[$MAP8:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)>
 
 // UNROLL-FULL-LABEL: func @loop_nest_simplest() {
 func.func @loop_nest_simplest() {
@@ -258,6 +260,72 @@ gpu.module @unroll_full {
   }
 }
 
+// UNROLL-FULL-LABEL: func @thread_partial_execution
+func.func @thread_partial_execution() {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
+  threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
+    affine.for %iv = %tx to 3 step 2 iter_args(%arg = %c0) -> index {
+      %sum = arith.addi %arg, %c0 : index
+      affine.yield %sum : index
+    }
+    // UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+    // UNROLL-FULL-NEXT: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+    // UNROLL-FULL-NEXT: affine.yield %[[SUM]] : index
+    // UNROLL-FULL-NEXT: }
+    gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL: func @unroll_all_thread
+func.func @unroll_all_thread() {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
+  threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
+    %threadid = gpu.thread_id x
+    affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %c0) -> index {
+      %sum = arith.addi %arg, %c0 : index
+      affine.yield %sum : index
+    }
+    // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+    // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+    // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+    gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
+func.func @partial_unroll_factor_4() {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
+  threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
+    %threadid = gpu.thread_id x
+    affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %c0) -> index {
+      %sum = arith.addi %arg, %c0 : index
+      affine.yield %sum : index
+    }
+    gpu.terminator
+  }
+  // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id x
+  // UNROLL-FULL-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+  // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-FULL-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-FULL-NEXT: affine.for %{{.*}} = [[$MAP7]]()[%[[ID]]] to 9 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+  // UNROLL-FULL-NEXT: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-FULL-NEXT: affine.yield %[[SUM_4]] : index
+  // UNROLL-FULL-NEXT: }
+  return
+}
+
 // SHORT-LABEL: func @loop_nest_outer_unroll() {
 func.func @loop_nest_outer_unroll() {
   // SHORT: affine.for %arg0 = 0 to 4 {
@@ -470,7 +538,7 @@ func.func @loop_nest_operand1() {
 // UNROLL-BY-4-LABEL: func @loop_nest_operand2() {
 func.func @loop_nest_operand2() {
 // UNROLL-BY-4: affine.for %arg0 = 0 to 100 step 2 {
-// UNROLL-BY-4-NEXT: affine.for %arg1 = [[$MAP11]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 {
+// UNROLL-BY-4-NEXT: affine.for %arg1 = [[$MAP7]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 {
 // UNROLL-BY-4-NEXT: %0 = "foo"() : () -> i32
 // UNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32
 // UNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32
@@ -516,7 +584,7 @@ func.func @floordiv_mod_ub(%M : index, %N : index) {
 func.func @loop_nest_operand3() {
   // UNROLL-BY-4: affine.for %arg0 = 0 to 100 step 2 {
   affine.for %i = 0 to 100 step 2 {
-    // UNROLL-BY-4: affine.for %arg1 = [[$MAP11]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 {
+    // UNROLL-BY-4: affine.for %arg1 = [[$MAP7]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 {
     // UNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32
     // UNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32
     // UNROLL-BY-4-NEXT: %3 = "foo"() : () -> i32
@@ -701,6 +769,32 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32
   return %sum : f32
 }
 
+// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4
+func.func @gpu_launch_unroll_by_factor_4() {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  // UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
+  threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
+    %threadid = gpu.thread_id x
+    affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %c0) -> index {
+      %sum = arith.addi %arg, %c0 : index
+      affine.yield %sum : index
+    }
+    gpu.terminator
+  }
+  // UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id x
+  // UNROLL-BY-4-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP8]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+  // UNROLL-BY-4-NEXT: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: affine.yield %[[SUM_4]] : index
+  // UNROLL-BY-4-NEXT: }
+  return
+}
+
 // UNROLL-FULL: func @unroll_zero_trip_count_case
 func.func @unroll_zero_trip_count_case() {
   // CHECK-NEXT: affine.for %{{.*}} = 0 to 0