-
Notifications
You must be signed in to change notification settings - Fork 13.4k
[LoopInterchange] Bail out early if minimum loop nest is not met #115128
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms Author: Madhur Amilkanthwar (madhur13490) Changes: This patch bails out early if the minimum depth is not met. As the patch avoids unnecessary computation, it is aimed at improving compile-time. Full diff: https://github.com/llvm/llvm-project/pull/115128.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index db63bda1e6b926..09bd7d8eda8547 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -234,6 +234,14 @@ static void populateWorklist(Loop &L, LoopVector &LoopList) {
LoopList.push_back(CurrentLoop);
}
+static bool hasMinimumLoopDepth(SmallVectorImpl<Loop *> &LoopList) {
+ unsigned LoopNestDepth = LoopList.size();
+ if (LoopNestDepth < 2) {
+ LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+ return false;
+ }
+ return true;
+}
namespace {
/// LoopInterchangeLegality checks if it is legal to interchange the loop.
@@ -416,11 +424,11 @@ struct LoopInterchange {
bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
bool Changed = false;
- unsigned LoopNestDepth = LoopList.size();
- if (LoopNestDepth < 2) {
- LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+
+ if (!hasMinimumLoopDepth(LoopList))
return false;
- }
+
+ unsigned LoopNestDepth = LoopList.size();
if (LoopNestDepth > MaxLoopNestDepth) {
LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
<< MaxLoopNestDepth << "\n");
@@ -1713,6 +1721,12 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
LPMUpdater &U) {
Function &F = *LN.getParent();
+ SmallVector<Loop *, 8> LoopList(LN.getLoops());
+
+ // Ensure minimum depth of the loop nest to do the interchange.
+ if (!hasMinimumLoopDepth(LoopList))
+ return PreservedAnalyses::all();
+
DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
std::unique_ptr<CacheCost> CC =
CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
diff --git a/llvm/test/Transforms/LoopInterchange/bail-out-one-loop.ll b/llvm/test/Transforms/LoopInterchange/bail-out-one-loop.ll
new file mode 100644
index 00000000000000..4a532e1a862e60
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/bail-out-one-loop.ll
@@ -0,0 +1,65 @@
+; REQUIRES: asserts
+
+; RUN: opt < %s -passes=loop-interchange -debug -disable-output | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+
+@N = dso_local global i32 0, align 4
+@a = dso_local global ptr null, align 8
+@b = dso_local global ptr null, align 8
+@c = dso_local global ptr null, align 8
+
+; Loop interchange should not run delinearization
+; for one loop case and should bail out early.
+
+; CHECK-NOT: Delinearizing
+; CHECK-NOT: Strides:
+; CHECK-NOT: Terms:
+; CHECK: Loop doesn't contain minimum nesting level.
+
+define void @foo() {
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 0, ptr %retval, align 4
+ store i32 0, ptr %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, ptr %i, align 4
+ %1 = load i32, ptr @N, align 4
+ %cmp = icmp ult i32 %0, %1
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond
+ br label %for.end
+
+for.body: ; preds = %for.cond
+ %2 = load ptr, ptr @b, align 8
+ %3 = load i32, ptr %i, align 4
+ %idxprom = zext i32 %3 to i64
+ %arrayidx = getelementptr inbounds nuw i32, ptr %2, i64 %idxprom
+ %4 = load i32, ptr %arrayidx, align 4
+ %5 = load ptr, ptr @c, align 8
+ %6 = load i32, ptr %i, align 4
+ %idxprom1 = zext i32 %6 to i64
+ %arrayidx2 = getelementptr inbounds nuw i32, ptr %5, i64 %idxprom1
+ %7 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %4, %7
+ %8 = load ptr, ptr @a, align 8
+ %9 = load i32, ptr %i, align 4
+ %idxprom3 = zext i32 %9 to i64
+ %arrayidx4 = getelementptr inbounds nuw i32, ptr %8, i64 %idxprom3
+ store i32 %add, ptr %arrayidx4, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %10 = load i32, ptr %i, align 4
+ %inc = add i32 %10, 1
+ store i32 %inc, ptr %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond.cleanup
+ ret void
+}
+
|
You can test this locally with the following command:
git-clang-format --diff 804d3c4ce192391ef7ba8724c6b9eff456b5c4b2 38ceb083dbba69aa66dc5e401da80f5c34ed461b --extensions cpp -- llvm/lib/Transforms/Scalar/LoopInterchange.cpp
View the diff from clang-format here.
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index a8c3d61030..c774919e5b 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -426,7 +426,8 @@ struct LoopInterchange {
bool Changed = false;
// Ensure minimum loop nest depth.
- assert(hasMinimumLoopDepth(LoopList) && "Loop nest does not meet minimum depth.");
+ assert(hasMinimumLoopDepth(LoopList) &&
+ "Loop nest does not meet minimum depth.");
unsigned LoopNestDepth = LoopList.size();
if (LoopNestDepth > MaxLoopNestDepth) {
|
636cdf1
to
347b158
Compare
347b158
to
000adcf
Compare
Motivation for this patch. |
000adcf
to
494b81a
Compare
This patch bails out early if the minimum depth is not met. As it stands today, the pass computes CacheCost before it attempts to do the transform. This is not needed if the minimum depth is not met. This handles basic cases where the depth is typically 1. As the patch avoids unnecessary computation, it is aimed at improving compile-time.
494b81a
to
38ceb08
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
This patch is an extension to llvm#115128. After profiling the LLVM test-suite, I see many loop nests with a depth greater than `MaxLoopNestDepth`, which is 10. Exiting early for them would save compile-time, as it would avoid computing DependenceInfo and CacheCost.
This patch is an extension to llvm#115128. After profiling the LLVM test-suite, I see many loop nests with a depth greater than `MaxLoopNestDepth`, which is 10. Exiting early for them would save compile-time, as it would avoid computing DependenceInfo and CacheCost.
This patch is an extension to llvm#115128. After profiling the LLVM test-suite, I see many loop nests with a depth greater than `MaxLoopNestDepth`, which is 10. Exiting early for them would save compile-time, as it would avoid computing DependenceInfo and CacheCost.
This patch is an extension to #115128. After profiling the LLVM test-suite, I see many loop nests with a depth greater than `MaxLoopNestDepth`, which is 10. Exiting early for them would save compile-time, as it would avoid computing DependenceInfo and CacheCost. Please see the 'bound-max-depth' branch on compile-time-tracker.
This is a work-in-progress patch to enable loop-interchange by default and is a continuation of the RFC: https://discourse.llvm.org/t/enabling-loop-interchange/82589 Basically, we promised to fix any compile-time and correctness issues in the different components involved here (loop-interchange and dependence analysis) before discussing enabling interchange by default. We think we are close to completing this; I would like to explain where we are and wanted to check if there are any thoughts or concerns. A quick overview of the correctness and compile-time improvements that we have made includes: Correctness: - [LoopInterchange] Remove 'S' Scalar Dependencies (llvm#119345) - [LoopInterchange] Fix overflow in cost calculation (llvm#111807) - [LoopInterchange] Handle LE and GE correctly (PR llvm#124901) @kasuga-fj - [DA] disambiguate evolution of base addresses (llvm#116628) Compile-times: - [LoopInterchange] Constrain number of load/stores in a loop (llvm#118973) - [LoopInterchange] Bail out early if minimum loop nest is not met (llvm#115128) - [LoopInterchange] Hoist isComputableLoopNest() in the control flow (llvm#124247) And in terms of remaining work, we think we are very close to fixing these dependence analysis issues: - [DA] do not handle array accesses of different offsets (llvm#123436) - [DA] Dependence analysis does not handle array accesses of different sizes (llvm#116630) - [DA] use NSW arithmetic llvm#116632 The compile-time increase, with a geomean increase of 0.19%, looks good (after committing llvm#124247), I think: stage1-O3: Benchmark kimwitu++ +0.10% sqlite3 +0.14% consumer-typeset +0.07% Bullet +0.06% tramp3d-v4 +0.21% mafft +0.39% ClamAVi +0.06% lencod +0.61% SPASS +0.17% 7zip +0.08% geomean +0.19% See also: http://llvm-compile-time-tracker.com/compare.php?from=19a7fe03b4f58c4f73ea91d5e63bc4c6e61f987b&to=b24f1367d68ee675ea93ecda4939208c6b68ae4b&stat=instructions%3Au We might want to look into lencod to see if we can improve more, but I am not sure it is
strictly necessary.
This patch bails out early if minimum depth
is not met. As it stands today, the pass computes
CacheCost before it attempts to do the transform.
This is not needed if minimum depth is not met.
This handles basic cases where depth is typically 1.
As the patch avoids unnecessary computation, it is aimed to improve compile-time.