Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 3b75363

Browse files
authored
Parse files in parallel when possible (#21175)
The idea is simple: the new parser doesn't need the GIL, so we can parse files in parallel. Because it is tricky to apply parallelization _only_ to parallelizable code, the most I see is a ~4-5x speed-up with 8 threads; if I add more threads, it doesn't get visibly faster (I have 16 physical cores).
Some notes on implementation:
* I use the stdlib `ThreadPoolExecutor`; it seems to work OK.
* I refactored `parse_file()` a bit, so that we can parallelize (mostly) just the actual parsing. I see measurable degradation if I try to parallelize all of `parse_file()`.
* I do not always use `psutil` because it is an optional dependency. We may want to actually make it a required dependency at some point.
* It looks like there is a weird mypyc bug that causes `ast_serialize` to be `None` sometimes in some threads. I simply add an ugly workaround for now.
* It looks like I need to apply `wrap_context()` more consistently now. A bunch of tests used to pass accidentally before.
* I only implement parallelization in the coordinator process. The workers counterpart can be done after #21119 is merged (it will be trivial).
1 parent 289f408 commit 3b75363

12 files changed

Lines changed: 387 additions & 159 deletions

misc/dump-ast.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def dump(fname: str, python_version: tuple[int, int], quiet: bool = False) -> No
1919
options.python_version = python_version
2020
with open(fname, "rb") as f:
2121
s = f.read()
22-
tree = parse(s, fname, None, errors=Errors(options), options=options)
22+
tree = parse(s, fname, None, errors=Errors(options), options=options, file_exists=True)
2323
if not quiet:
2424
print(tree)
2525

mypy/build.py

Lines changed: 210 additions & 77 deletions
Large diffs are not rendered by default.

mypy/checkstrformat.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
MemberExpr,
4040
MypyFile,
4141
NameExpr,
42-
Node,
4342
StarExpr,
4443
StrExpr,
4544
TempNode,
@@ -582,8 +581,13 @@ def apply_field_accessors(
582581

583582
temp_errors = Errors(self.chk.options)
584583
dummy = DUMMY_FIELD_NAME + spec.field[len(spec.key) :]
585-
temp_ast: Node = parse(
586-
dummy, fnam="<format>", module=None, options=self.chk.options, errors=temp_errors
584+
temp_ast, _ = parse(
585+
dummy,
586+
fnam="<format>",
587+
module=None,
588+
options=self.chk.options,
589+
errors=temp_errors,
590+
file_exists=False,
587591
)
588592
if temp_errors.is_errors():
589593
self.msg.fail(

mypy/nativeparse.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
from __future__ import annotations
2121

2222
import os
23-
from typing import Any, Final, cast
23+
import time
24+
from typing import Final, cast
2425

2526
import ast_serialize # type: ignore[import-untyped, import-not-found, unused-ignore]
2627
from librt.internal import (
@@ -101,6 +102,7 @@
101102
OpExpr,
102103
OverloadedFuncDef,
103104
OverloadPart,
105+
ParseError,
104106
PassStmt,
105107
RaiseStmt,
106108
RefExpr,
@@ -168,17 +170,11 @@
168170
class State:
169171
def __init__(self, options: Options) -> None:
170172
self.options = options
171-
self.errors: list[dict[str, Any]] = []
173+
self.errors: list[ParseError] = []
172174
self.num_funcs = 0
173175

174176
def add_error(
175-
self,
176-
message: str,
177-
line: int,
178-
column: int,
179-
*,
180-
blocker: bool = False,
181-
code: str | None = None,
177+
self, message: str, line: int, column: int, *, blocker: bool = False, code: str
182178
) -> None:
183179
"""Report an error at a specific location.
184180
@@ -196,7 +192,7 @@ def add_error(
196192

197193
def native_parse(
198194
filename: str, options: Options, skip_function_bodies: bool = False, imports_only: bool = False
199-
) -> tuple[MypyFile, list[dict[str, Any]], TypeIgnores]:
195+
) -> tuple[MypyFile, list[ParseError], TypeIgnores]:
200196
"""Parse a Python file using the native Rust-based parser.
201197
202198
Uses the ast_serialize Rust extension to parse Python code and deserialize
@@ -214,7 +210,7 @@ def native_parse(
214210
Returns:
215211
A tuple containing:
216212
- MypyFile: The parsed AST as a mypy AST node
217-
- list[dict[str, Any]]: List of parse errors and deserialization errors
213+
- list[ParseError]: List of parse errors and deserialization errors
218214
- TypeIgnores: List of (line_number, ignored_codes) tuples for type: ignore comments
219215
"""
220216
# If the path is a directory, return empty AST (matching fastparse behavior)
@@ -272,7 +268,14 @@ def read_statements(state: State, data: ReadBuffer, n: int) -> list[Statement]:
272268

273269
def parse_to_binary_ast(
274270
filename: str, options: Options, skip_function_bodies: bool = False
275-
) -> tuple[bytes, list[dict[str, Any]], TypeIgnores, bytes, bool, bool]:
271+
) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool]:
272+
# This is a horrible hack to work around a mypyc bug where imported
273+
# module may be not ready in a thread sometimes.
274+
t0 = time.time()
275+
while ast_serialize is None:
276+
time.sleep(0.0001) # type: ignore[unreachable]
277+
if time.time() - t0 > 10.0:
278+
raise ImportError("Cannot import ast_serialize")
276279
ast_bytes, errors, ignores, import_bytes, ast_data = ast_serialize.parse(
277280
filename,
278281
skip_function_bodies=skip_function_bodies,
@@ -284,7 +287,7 @@ def parse_to_binary_ast(
284287
)
285288
return (
286289
ast_bytes,
287-
cast("list[dict[str, Any]]", errors),
290+
errors,
288291
ignores,
289292
import_bytes,
290293
ast_data["is_partial_package"],

mypy/nodes.py

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,13 @@
1414
Final,
1515
Optional,
1616
TypeAlias as _TypeAlias,
17+
TypedDict,
1718
TypeGuard,
1819
TypeVar,
1920
Union,
2021
cast,
2122
)
23+
from typing_extensions import NotRequired
2224

2325
from librt.internal import (
2426
extract_symbol,
@@ -39,7 +41,9 @@
3941
LIST_GEN,
4042
LIST_STR,
4143
LITERAL_COMPLEX,
44+
LITERAL_FALSE,
4245
LITERAL_NONE,
46+
LITERAL_TRUE,
4347
ReadBuffer,
4448
Tag,
4549
WriteBuffer,
@@ -313,6 +317,39 @@ def read(cls, data: ReadBuffer) -> SymbolNode:
313317
Definition: _TypeAlias = tuple[str, "SymbolTableNode", Optional["TypeInfo"]]
314318

315319

320+
class ParseError(TypedDict):
321+
line: int
322+
column: int
323+
message: str
324+
blocker: NotRequired[bool]
325+
code: NotRequired[str]
326+
327+
328+
def write_parse_error(data: WriteBuffer, err: ParseError) -> None:
329+
write_int(data, err["line"])
330+
write_int(data, err["column"])
331+
write_str(data, err["message"])
332+
if (blocker := err.get("blocker")) is not None:
333+
write_bool(data, blocker)
334+
else:
335+
write_tag(data, LITERAL_NONE)
336+
write_str_opt(data, err.get("code"))
337+
338+
339+
def read_parse_error(data: ReadBuffer) -> ParseError:
340+
err: ParseError = {"line": read_int(data), "column": read_int(data), "message": read_str(data)}
341+
tag = read_tag(data)
342+
if tag == LITERAL_TRUE:
343+
err["blocker"] = True
344+
elif tag == LITERAL_FALSE:
345+
err["blocker"] = False
346+
else:
347+
assert tag == LITERAL_NONE
348+
if (code := read_str_opt(data)) is not None:
349+
err["code"] = code
350+
return err
351+
352+
316353
class FileRawData:
317354
"""Raw (binary) data representing parsed, but not deserialized file."""
318355

@@ -327,7 +364,7 @@ class FileRawData:
327364

328365
defs: bytes
329366
imports: bytes
330-
raw_errors: list[dict[str, Any]] # TODO: switch to more precise type here.
367+
raw_errors: list[ParseError]
331368
ignored_lines: dict[int, list[str]]
332369
is_partial_stub_package: bool
333370
uses_template_strings: bool
@@ -336,7 +373,7 @@ def __init__(
336373
self,
337374
defs: bytes,
338375
imports: bytes,
339-
raw_errors: list[dict[str, Any]],
376+
raw_errors: list[ParseError],
340377
ignored_lines: dict[int, list[str]],
341378
is_partial_stub_package: bool,
342379
uses_template_strings: bool,
@@ -354,7 +391,7 @@ def write(self, data: WriteBuffer) -> None:
354391
write_tag(data, LIST_GEN)
355392
write_int_bare(data, len(self.raw_errors))
356393
for err in self.raw_errors:
357-
write_json(data, err)
394+
write_parse_error(data, err)
358395
write_tag(data, DICT_INT_GEN)
359396
write_int_bare(data, len(self.ignored_lines))
360397
for line, codes in self.ignored_lines.items():
@@ -368,7 +405,7 @@ def read(cls, data: ReadBuffer) -> FileRawData:
368405
defs = read_bytes(data)
369406
imports = read_bytes(data)
370407
assert read_tag(data) == LIST_GEN
371-
raw_errors = [read_json(data) for _ in range(read_int_bare(data))]
408+
raw_errors = [read_parse_error(data) for _ in range(read_int_bare(data))]
372409
assert read_tag(data) == DICT_INT_GEN
373410
ignored_lines = {read_int(data): read_str_list(data) for _ in range(read_int_bare(data))}
374411
return FileRawData(

mypy/parse.py

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
from __future__ import annotations
22

3-
import os
43
import re
54

65
from librt.internal import ReadBuffer
76

87
from mypy import errorcodes as codes
98
from mypy.cache import read_int
109
from mypy.errors import Errors
11-
from mypy.nodes import FileRawData, MypyFile
10+
from mypy.nodes import FileRawData, MypyFile, ParseError
1211
from mypy.options import Options
1312

1413

@@ -18,9 +17,9 @@ def parse(
1817
module: str | None,
1918
errors: Errors,
2019
options: Options,
21-
raise_on_error: bool = False,
20+
file_exists: bool,
2221
imports_only: bool = False,
23-
) -> MypyFile:
22+
) -> tuple[MypyFile, list[ParseError]]:
2423
"""Parse a source file, without doing any semantic analysis.
2524
2625
Return the parse tree. If errors is not provided, raise ParseError
@@ -31,14 +30,12 @@ def parse(
3130
if options.native_parser:
3231
# Native parser only works with actual files on disk
3332
# Fall back to fastparse for in-memory source or non-existent files
34-
if os.path.exists(fnam):
33+
if file_exists:
3534
import mypy.nativeparse
3635

3736
ignore_errors = options.ignore_errors or fnam in errors.ignored_files
3837
# If errors are ignored, we can drop many function bodies to speed up type checking.
3938
strip_function_bodies = ignore_errors and not options.preserve_asts
40-
41-
errors.set_file(fnam, module, options=options)
4239
tree, parse_errors, type_ignores = mypy.nativeparse.native_parse(
4340
fnam,
4441
options,
@@ -51,26 +48,7 @@ def parse(
5148
tree.is_stub = fnam.endswith(".pyi")
5249
# Note: tree.imports is populated directly by native_parse with deserialized
5350
# import metadata, so we don't need to collect imports via AST traversal
54-
55-
# Report parse errors
56-
for error in parse_errors:
57-
message = error["message"]
58-
# Standardize error message by capitalizing the first word
59-
message = re.sub(r"^(\s*\w)", lambda m: m.group(1).upper(), message)
60-
# Respect blocker status from error, default to True for syntax errors
61-
is_blocker = error.get("blocker", True)
62-
error_code = error.get("code")
63-
if error_code is None:
64-
error_code = codes.SYNTAX
65-
else:
66-
# Fallback to [syntax] for backwards compatibility.
67-
error_code = codes.error_codes.get(error_code) or codes.SYNTAX
68-
errors.report(
69-
error["line"], error["column"], message, blocker=is_blocker, code=error_code
70-
)
71-
if raise_on_error and errors.is_errors():
72-
errors.raise_error()
73-
return tree
51+
return tree, parse_errors
7452
# Fall through to fastparse for non-existent files
7553

7654
assert not imports_only
@@ -79,9 +57,7 @@ def parse(
7957
import mypy.fastparse
8058

8159
tree = mypy.fastparse.parse(source, fnam=fnam, module=module, errors=errors, options=options)
82-
if raise_on_error and errors.is_errors():
83-
errors.raise_error()
84-
return tree
60+
return tree, []
8561

8662

8763
def load_from_raw(
@@ -112,14 +88,21 @@ def load_from_raw(
11288
all_errors = raw_data.raw_errors + state.errors
11389
errors.set_file(fnam, module, options=options)
11490
for error in all_errors:
115-
message = error["message"]
116-
message = re.sub(r"^(\s*\w)", lambda m: m.group(1).upper(), message)
117-
is_blocker = error.get("blocker", True)
118-
error_code = error.get("code")
119-
if error_code is None:
120-
error_code = codes.SYNTAX
121-
else:
122-
error_code = codes.error_codes.get(error_code) or codes.SYNTAX
12391
# Note we never raise in this function, so it should not be called in coordinator.
124-
errors.report(error["line"], error["column"], message, blocker=is_blocker, code=error_code)
92+
report_parse_error(error, errors)
12593
return tree
94+
95+
96+
def report_parse_error(error: ParseError, errors: Errors) -> None:
97+
message = error["message"]
98+
# Standardize error message by capitalizing the first word
99+
message = re.sub(r"^(\s*\w)", lambda m: m.group(1).upper(), message)
100+
# Respect blocker status from error, default to True for syntax errors
101+
is_blocker = error.get("blocker", True)
102+
error_code = error.get("code")
103+
if error_code is None:
104+
error_code = codes.SYNTAX
105+
else:
106+
# Fallback to [syntax] for backwards compatibility.
107+
error_code = codes.error_codes.get(error_code) or codes.SYNTAX
108+
errors.report(error["line"], error["column"], message, blocker=is_blocker, code=error_code)

mypy/semanal_main.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -463,17 +463,18 @@ def apply_class_plugin_hooks(graph: Graph, scc: list[str], errors: Errors) -> No
463463
state = graph[module]
464464
tree = state.tree
465465
assert tree
466-
for _, node, _ in tree.local_definitions():
467-
if isinstance(node.node, TypeInfo):
468-
if not apply_hooks_to_class(
469-
state.manager.semantic_analyzer,
470-
module,
471-
node.node,
472-
state.options,
473-
tree,
474-
errors,
475-
):
476-
incomplete = True
466+
with state.wrap_context():
467+
for _, node, _ in tree.local_definitions():
468+
if isinstance(node.node, TypeInfo):
469+
if not apply_hooks_to_class(
470+
state.manager.semantic_analyzer,
471+
module,
472+
node.node,
473+
state.options,
474+
tree,
475+
errors,
476+
):
477+
incomplete = True
477478

478479

479480
def apply_hooks_to_class(
@@ -524,7 +525,10 @@ def calculate_class_properties(graph: Graph, scc: list[str], errors: Errors) ->
524525
assert tree
525526
for _, node, _ in tree.local_definitions():
526527
if isinstance(node.node, TypeInfo):
527-
with state.manager.semantic_analyzer.file_context(tree, state.options, node.node):
528+
with (
529+
state.wrap_context(),
530+
state.manager.semantic_analyzer.file_context(tree, state.options, node.node),
531+
):
528532
calculate_class_abstract_status(node.node, tree.is_stub, errors)
529533
check_protocol_status(node.node, errors)
530534
calculate_class_vars(node.node)

mypy/stubgen.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1744,10 +1744,17 @@ def parse_source_file(mod: StubSource, mypy_options: MypyOptions) -> None:
17441744
data = f.read()
17451745
source = mypy.util.decode_python_encoding(data)
17461746
errors = Errors(mypy_options)
1747-
mod.ast = mypy.parse.parse(
1748-
source, fnam=mod.path, module=mod.module, errors=errors, options=mypy_options
1747+
mod.ast, errs = mypy.parse.parse(
1748+
source,
1749+
fnam=mod.path,
1750+
module=mod.module,
1751+
errors=errors,
1752+
options=mypy_options,
1753+
file_exists=True,
17491754
)
17501755
mod.ast._fullname = mod.module
1756+
for err in errs:
1757+
mypy.parse.report_parse_error(err, errors)
17511758
if errors.is_blockers():
17521759
# Syntax error!
17531760
for m in errors.new_messages():

0 commit comments

Comments
 (0)