Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ba2374f

Browse files
authored
Use faster algorithm for topological sort (#20790)
In a large codebase, up to 9% of CPU was used in `topsort` when doing a small incremental run. This should make it significantly faster (I've verified that it's faster at least when using synthetic data). Use Kahn's algorithm, since it's O(V + E) rather than the O(depth * V) of the original algorithm. Description of the algorithm: https://www.geeksforgeeks.org/dsa/topological-sorting-indegree-based-solution/ `perf_compare.py` showed a small improvement in self check performance, but the difference is below the noise floor. This will likely mostly help with larger codebases. Keep the old `topsort` function around for now, so that we can test that the new and old functions behave identically in tests. I'll remove the old one afterwards. I used a coding agent to assist with this, but I did the implementation in multiple small increments.
1 parent 157b87d commit ba2374f

4 files changed

Lines changed: 149 additions & 13 deletions

File tree

mypy/build.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@
9494
ErrorTupleRaw,
9595
report_internal_error,
9696
)
97-
from mypy.graph_utils import prepare_sccs, strongly_connected_components, topsort
97+
from mypy.graph_utils import prepare_sccs, strongly_connected_components, topsort2
9898
from mypy.indirection import TypeIndirectionVisitor
9999
from mypy.ipc import BadStatus, IPCClient, IPCMessage, read_status, ready_to_read, receive, send
100100
from mypy.messages import MessageBuilder
@@ -4236,7 +4236,7 @@ def sorted_components(graph: Graph) -> list[SCC]:
42364236
scc_dep_map = prepare_sccs_full(strongly_connected_components(vertices, edges), edges)
42374237
# Topsort.
42384238
res = []
4239-
for ready in topsort(scc_dep_map):
4239+
for ready in topsort2(scc_dep_map):
42404240
# Sort the sets in ready by reversed smallest State.order. Examples:
42414241
#
42424242
# - If ready is [{x}, {y}], x.order == 1, y.order == 2, we get
@@ -4271,7 +4271,7 @@ def sorted_components_inner(
42714271
edges = {id: deps_filtered(graph, vertices, id, pri_max) for id in vertices}
42724272
sccs = list(strongly_connected_components(vertices, edges))
42734273
res = []
4274-
for ready in topsort(prepare_sccs(sccs, edges)):
4274+
for ready in topsort2(prepare_sccs(sccs, edges)):
42754275
res.extend(sorted(ready, key=lambda scc: -min(graph[id].order for id in scc)))
42764276
return res
42774277

mypy/graph_utils.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,76 @@ def topsort(data: dict[T, set[T]]) -> Iterable[set[T]]:
115115
yield ready
116116
data = {item: (dep - ready) for item, dep in data.items() if item not in ready}
117117
assert not data, f"A cyclic dependency exists amongst {data!r}"
118+
119+
120+
class topsort2(Iterator[set[T]]):  # noqa: N801
    """Topological sort using Kahn's algorithm.

    Produces the same batches as topsort(): each batch returned by the
    iterator is the set of vertices whose dependencies have all been
    returned in earlier batches. Instead of rebuilding the full dict and
    set objects on each iteration, it keeps in-degree counters and a
    reverse adjacency list, so the total work is O(V + E) rather than
    O(depth * V).

    Implemented as a class rather than a generator for better mypyc
    compilation.

    Args:
        data: A map from vertices to all vertices that it has an edge
            connecting it to. NOTE: The dependency sets in this map are
            modified in place -- self-dependencies are removed. Orphans
            (vertices that appear only as dependencies, never as keys)
            are tracked internally and returned in the first batch; no
            entries are added to data for them.

    Raises:
        AssertionError: Raised from __next__ when no vertex is ready but
            some vertices have not been returned yet, i.e. the graph
            contains a cycle.
    """

    def __init__(self, data: dict[T, set[T]]) -> None:
        # Single pass: remove self-deps, build reverse adjacency list,
        # compute in-degree counts, detect orphans, and find initial ready set.
        in_degree: dict[T, int] = {}  # Vertex -> number of unreturned dependencies.
        rev: dict[T, list[T]] = {}  # Vertex -> vertices that depend on it.
        ready: set[T] = set()  # Vertices with no unreturned dependencies.
        for item, deps in data.items():
            deps.discard(item)  # Ignore self dependencies.
            deg = len(deps)
            in_degree[item] = deg
            if deg == 0:
                ready.add(item)
            # Every vertex needs a rev entry, since __next__ indexes rev
            # unconditionally.
            if item not in rev:
                rev[item] = []
            for dep in deps:
                if dep in rev:
                    rev[dep].append(item)
                else:
                    rev[dep] = [item]
                    # Only checked on first sight of dep; later sightings
                    # take the branch above.
                    if dep not in data:
                        # Orphan: appears as dependency but has no entry in data.
                        in_degree[dep] = 0
                        ready.add(dep)

        self.in_degree = in_degree
        self.rev = rev
        self.ready = ready
        # Number of vertices that are not yet in any ready set.
        self.remaining = len(in_degree) - len(ready)

    def __iter__(self) -> Iterator[set[T]]:
        return self

    def __next__(self) -> set[T]:
        ready = self.ready
        if not ready:
            # Raise explicitly instead of using an assert statement, so
            # that cycles are still reported when asserts are stripped
            # (python -O).
            if self.remaining != 0:
                raise AssertionError(
                    f"A cyclic dependency exists amongst "
                    f"{[k for k, deg in self.in_degree.items() if deg > 0]!r}"
                )
            raise StopIteration
        in_degree = self.in_degree
        rev = self.rev
        new_ready: set[T] = set()
        # Returning the current batch satisfies one dependency of each
        # dependent; those that reach zero form the next batch.
        for item in ready:
            for dependent in rev[item]:
                new_deg = in_degree[dependent] - 1
                in_degree[dependent] = new_deg
                if new_deg == 0:
                    new_ready.add(dependent)
        self.remaining -= len(new_ready)
        self.ready = new_ready
        return ready

mypy/solve.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from mypy.constraints import SUBTYPE_OF, SUPERTYPE_OF, Constraint, infer_constraints, neg_op
1010
from mypy.expandtype import expand_type
11-
from mypy.graph_utils import prepare_sccs, strongly_connected_components, topsort
11+
from mypy.graph_utils import prepare_sccs, strongly_connected_components, topsort2
1212
from mypy.join import join_type_list
1313
from mypy.meet import meet_type_list, meet_types
1414
from mypy.subtypes import is_subtype
@@ -147,7 +147,7 @@ def solve_with_dependent(
147147
sccs = list(strongly_connected_components(set(vars), dmap))
148148
if not all(check_linear(scc, lowers, uppers) for scc in sccs):
149149
return {}, []
150-
raw_batches = list(topsort(prepare_sccs(sccs, dmap)))
150+
raw_batches = list(topsort2(prepare_sccs(sccs, dmap)))
151151

152152
free_vars = []
153153
free_solutions = {}

mypy/test/testgraph.py

Lines changed: 71 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from mypy.build import BuildManager, BuildSourceSet, State, order_ascc, sorted_components
99
from mypy.errors import Errors
1010
from mypy.fscache import FileSystemCache
11-
from mypy.graph_utils import strongly_connected_components, topsort
11+
from mypy.graph_utils import strongly_connected_components, topsort, topsort2
1212
from mypy.modulefinder import SearchPaths
1313
from mypy.options import Options
1414
from mypy.plugin import Plugin
@@ -18,14 +18,77 @@
1818

1919

2020
class GraphSuite(Suite):
21+
def test_topsort_empty(self) -> None:
22+
data: dict[AbstractSet[str], set[AbstractSet[str]]] = {}
23+
assert_equal(list(topsort2(data)), [])
24+
2125
def test_topsort(self) -> None:
22-
a = frozenset({"A"})
23-
b = frozenset({"B"})
24-
c = frozenset({"C"})
25-
d = frozenset({"D"})
26-
data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: {b, c}, b: {d}, c: {d}}
27-
res = list(topsort(data))
28-
assert_equal(res, [{d}, {b, c}, {a}])
26+
for topsort_func in [topsort, topsort2]:
27+
a = frozenset({"A"})
28+
b = frozenset({"B"})
29+
c = frozenset({"C"})
30+
d = frozenset({"D"})
31+
data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: {b, c}, b: {d}, c: {d}}
32+
res = list(topsort_func(data))
33+
assert_equal(res, [{d}, {b, c}, {a}])
34+
35+
def test_topsort_orphan(self) -> None:
36+
for topsort_func in [topsort, topsort2]:
37+
a = frozenset({"A"})
38+
b = frozenset({"B"})
39+
data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: {b}}
40+
res = list(topsort_func(data))
41+
assert_equal(res, [{b}, {a}])
42+
43+
def test_topsort_independent(self) -> None:
44+
for topsort_func in [topsort, topsort2]:
45+
a = frozenset({"A"})
46+
b = frozenset({"B"})
47+
c = frozenset({"C"})
48+
data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: set(), b: set(), c: set()}
49+
res = list(topsort_func(data))
50+
assert_equal(res, [{a, b, c}])
51+
52+
def test_topsort_linear_chain(self) -> None:
53+
for topsort_func in [topsort, topsort2]:
54+
a = frozenset({"A"})
55+
b = frozenset({"B"})
56+
c = frozenset({"C"})
57+
d = frozenset({"D"})
58+
data: dict[AbstractSet[str], set[AbstractSet[str]]] = {
59+
a: {b},
60+
b: {c},
61+
c: {d},
62+
d: set(),
63+
}
64+
res = list(topsort_func(data))
65+
assert_equal(res, [{d}, {c}, {b}, {a}])
66+
67+
def test_topsort_self_dependency(self) -> None:
68+
for topsort_func in [topsort, topsort2]:
69+
a = frozenset({"A"})
70+
b = frozenset({"B"})
71+
data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: {a, b}, b: set()}
72+
res = list(topsort_func(data))
73+
assert_equal(res, [{b}, {a}])
74+
75+
def test_topsort_orphan_diamond(self) -> None:
76+
for topsort_func in [topsort, topsort2]:
77+
a = frozenset({"A"})
78+
b = frozenset({"B"})
79+
c = frozenset({"C"})
80+
# B and C are orphans -- they appear only in values, not as keys.
81+
data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: {b, c}}
82+
res = list(topsort_func(data))
83+
assert_equal(res, [{b, c}, {a}])
84+
85+
def test_topsort_cycle(self) -> None:
86+
for topsort_func in [topsort, topsort2]:
87+
a = frozenset({"A"})
88+
b = frozenset({"B"})
89+
data: dict[AbstractSet[str], set[AbstractSet[str]]] = {a: {b}, b: {a}}
90+
with self.assertRaises(AssertionError):
91+
list(topsort_func(data))
2992

3093
def test_scc(self) -> None:
3194
vertices = {"A", "B", "C", "D"}

0 commit comments

Comments
 (0)