Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 781f1e6

Browse files
authored
Fix sequential bottleneck in parallel parsing (#21291)
Previously we always read the file, processed inline comments, and calculated sha1 for each parsed file sequentially in Python. Now these are mostly moved to the Rust extension, which allows better parallel scaling. I measured ~5% improvement to parallel type checking times in some cases on macOS (though it was a bit noisy, and used an earlier version of this PR). Related to #21215.
1 parent 3bf45e6 commit 781f1e6

6 files changed

Lines changed: 140 additions & 45 deletions

File tree

mypy/build.py

Lines changed: 80 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -953,7 +953,7 @@ def __init__(
953953
# until all the files have been added. This means that a
954954
# new file can be processed O(n**2) times. This cache
955955
# avoids most of this redundant work.
956-
self.ast_cache: dict[str, tuple[MypyFile, list[ErrorInfo]]] = {}
956+
self.ast_cache: dict[str, tuple[MypyFile, list[ErrorInfo], str | None]] = {}
957957
# Number of times we used GC optimization hack for fresh SCCs.
958958
self.gc_freeze_cycles = 0
959959
# Mapping from SCC id to corresponding SCC instance. This is populated
@@ -1039,11 +1039,66 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S
10391039
as an optimization to parallelize only those parts of the code that can be
10401040
parallelized efficiently.
10411041
"""
1042+
parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw(
1043+
sequential_states, parallel_states
1044+
)
1045+
1046+
for state in parallel_parsed_states:
1047+
# New parser returns serialized ASTs. Deserialize full trees only if not using
1048+
# parallel workers.
1049+
with state.wrap_context():
1050+
assert state.tree is not None
1051+
raw_data = state.tree.raw_data
1052+
if raw_data is not None:
1053+
# Apply inline mypy config before deserialization, since
1054+
# some options (e.g. implicit_optional) affect deserialization
1055+
state.source_hash = raw_data.source_hash
1056+
state.apply_inline_configuration(raw_data.mypy_comments)
1057+
state.tree = load_from_raw(
1058+
state.xpath,
1059+
state.id,
1060+
raw_data,
1061+
self.errors,
1062+
state.options,
1063+
imports_only=bool(self.workers),
1064+
)
1065+
if self.errors.is_blockers():
1066+
self.log("Bailing due to parse errors")
1067+
self.errors.raise_error()
1068+
1069+
for state in parallel_states:
1070+
assert state.tree is not None
1071+
if state in parallel_parsed_states_set:
1072+
if state.tree.raw_data is not None:
1073+
# source_hash was already extracted above, but raw_data
1074+
# may have been preserved for workers (imports_only=True).
1075+
pass
1076+
elif state.source_hash is None:
1077+
# At least namespace packages may not have source.
1078+
state.get_source()
1079+
state.size_hint = os.path.getsize(state.xpath)
1080+
state.early_errors = list(self.errors.error_info_map.get(state.xpath, []))
1081+
state.semantic_analysis_pass1()
1082+
self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash)
1083+
self.modules[state.id] = state.tree
1084+
state.check_blockers()
1085+
state.setup_errors()
1086+
1087+
def parse_files_threaded_raw(
1088+
self, sequential_states: list[State], parallel_states: list[State]
1089+
) -> tuple[list[State], set[State]]:
1090+
"""Parse files using a thread pool.
1091+
1092+
Also parse sequential states while waiting for the parallel results.
1093+
Trees from the new parser are left in raw (serialized) form.
1094+
1095+
Return (list, set) of states that were actually parsed (not cached).
1096+
"""
10421097
futures = []
10431098
# Use both list and a set to have more predictable order of errors,
10441099
# while also not sacrificing performance.
1045-
parallel_parsed_states = []
1046-
parallel_parsed_states_set = set()
1100+
parallel_parsed_states: list[State] = []
1101+
parallel_parsed_states_set: set[State] = set()
10471102
# Use at least --num-workers if specified by user.
10481103
available_threads = max(get_available_threads(), self.options.num_workers)
10491104
# Overhead from trying to parallelize (small) blocking portion of
@@ -1052,53 +1107,27 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S
10521107
with ThreadPoolExecutor(max_workers=min(available_threads, 8)) as executor:
10531108
for state in parallel_states:
10541109
state.needs_parse = False
1055-
# New parser reads source from file directly, we do this only for
1056-
# the side effect of parsing inline mypy configurations.
1057-
state.get_source()
10581110
if state.id not in self.ast_cache:
10591111
self.log(f"Parsing {state.xpath} ({state.id})")
10601112
ignore_errors = state.ignore_all or state.options.ignore_errors
10611113
if ignore_errors:
10621114
self.errors.ignored_files.add(state.xpath)
1063-
futures.append(executor.submit(state.parse_file_inner, state.source or ""))
1115+
futures.append(executor.submit(state.parse_file_inner, ""))
10641116
parallel_parsed_states.append(state)
10651117
parallel_parsed_states_set.add(state)
10661118
else:
10671119
self.log(f"Using cached AST for {state.xpath} ({state.id})")
1068-
state.tree, state.early_errors = self.ast_cache[state.id]
1120+
state.tree, state.early_errors, source_hash = self.ast_cache[state.id]
1121+
state.source_hash = source_hash
10691122

10701123
# Parse sequential before waiting on parallel.
10711124
for state in sequential_states:
10721125
state.parse_file()
10731126

10741127
for fut in wait(futures).done:
10751128
fut.result()
1076-
for state in parallel_parsed_states:
1077-
# New parser returns serialized trees that need to be de-serialized.
1078-
with state.wrap_context():
1079-
assert state.tree is not None
1080-
if state.tree.raw_data:
1081-
state.tree = load_from_raw(
1082-
state.xpath,
1083-
state.id,
1084-
state.tree.raw_data,
1085-
self.errors,
1086-
state.options,
1087-
imports_only=bool(self.workers),
1088-
)
1089-
if self.errors.is_blockers():
1090-
self.log("Bailing due to parse errors")
1091-
self.errors.raise_error()
10921129

1093-
for state in parallel_states:
1094-
assert state.tree is not None
1095-
if state in parallel_parsed_states_set:
1096-
state.early_errors = list(self.errors.error_info_map.get(state.xpath, []))
1097-
state.semantic_analysis_pass1()
1098-
self.ast_cache[state.id] = (state.tree, state.early_errors)
1099-
self.modules[state.id] = state.tree
1100-
state.check_blockers()
1101-
state.setup_errors()
1130+
return parallel_parsed_states, parallel_parsed_states_set
11021131

11031132
def post_parse_all(self, states: list[State]) -> None:
11041133
for state in states:
@@ -3090,7 +3119,6 @@ def get_source(self) -> str:
30903119
self.source_hash = compute_hash(source)
30913120

30923121
self.parse_inline_configuration(source)
3093-
self.check_for_invalid_options()
30943122

30953123
self.size_hint = len(source)
30963124
self.time_spent_us += time_spent_us(t0)
@@ -3115,7 +3143,10 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None =
31153143
# The file was already parsed.
31163144
return
31173145

3118-
source = self.get_source()
3146+
if raw_data is None:
3147+
source = self.get_source()
3148+
else:
3149+
source = ""
31193150
manager = self.manager
31203151
# Can we reuse a previously parsed AST? This avoids redundant work in daemon.
31213152
if self.id not in manager.ast_cache:
@@ -3125,6 +3156,12 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None =
31253156
self.manager.errors.ignored_files.add(self.xpath)
31263157
with self.wrap_context():
31273158
manager.errors.set_file(self.xpath, self.id, options=self.options)
3159+
if raw_data is not None:
3160+
# Apply inline mypy config before deserialization, since
3161+
# some options (e.g. implicit_optional) affect how the
3162+
# AST is built during deserialization.
3163+
self.source_hash = raw_data.source_hash
3164+
self.apply_inline_configuration(raw_data.mypy_comments)
31283165
self.parse_file_inner(source, raw_data)
31293166
assert self.tree is not None
31303167
# New parser returns serialized trees that need to be de-serialized.
@@ -3149,14 +3186,15 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None =
31493186
else:
31503187
# Reuse a cached AST
31513188
manager.log(f"Using cached AST for {self.xpath} ({self.id})")
3152-
self.tree, self.early_errors = manager.ast_cache[self.id]
3189+
self.tree, self.early_errors, source_hash = manager.ast_cache[self.id]
3190+
self.source_hash = source_hash
31533191

31543192
assert self.tree is not None
31553193
if not temporary:
31563194
manager.modules[self.id] = self.tree
31573195
self.check_blockers()
31583196

3159-
manager.ast_cache[self.id] = (self.tree, self.early_errors)
3197+
manager.ast_cache[self.id] = (self.tree, self.early_errors, self.source_hash)
31603198
self.setup_errors()
31613199

31623200
def setup_errors(self) -> None:
@@ -3169,12 +3207,17 @@ def setup_errors(self) -> None:
31693207
def parse_inline_configuration(self, source: str) -> None:
31703208
"""Check for inline mypy: options directive and parse them."""
31713209
flags = get_mypy_comments(source)
3210+
self.apply_inline_configuration(flags)
3211+
3212+
def apply_inline_configuration(self, flags: list[tuple[int, str]] | None) -> None:
3213+
"""Apply inline mypy configuration comments and check for invalid options."""
31723214
if flags:
31733215
changes, config_errors = parse_mypy_comments(flags, self.options)
31743216
self.options = self.options.apply_changes(changes)
31753217
self.manager.errors.set_file(self.xpath, self.id, self.options)
31763218
for lineno, error in config_errors:
31773219
self.manager.error(lineno, error)
3220+
self.check_for_invalid_options()
31783221

31793222
def check_for_invalid_options(self) -> None:
31803223
if self.options.mypyc and not self.options.strict_bytes:

mypy/nativeparse.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -210,13 +210,27 @@ def native_parse(
210210
node.path = filename
211211
return node, [], []
212212

213-
b, errors, ignores, import_bytes, is_partial_package, uses_template_strings = (
214-
parse_to_binary_ast(filename, options, skip_function_bodies)
215-
)
213+
(
214+
b,
215+
errors,
216+
ignores,
217+
import_bytes,
218+
is_partial_package,
219+
uses_template_strings,
220+
source_hash,
221+
mypy_comments,
222+
) = parse_to_binary_ast(filename, options, skip_function_bodies)
216223
node = MypyFile([], [])
217224
node.path = filename
218225
node.raw_data = FileRawData(
219-
b, import_bytes, errors, dict(ignores), is_partial_package, uses_template_strings
226+
b,
227+
import_bytes,
228+
errors,
229+
dict(ignores),
230+
is_partial_package,
231+
uses_template_strings,
232+
source_hash,
233+
mypy_comments,
220234
)
221235
return node, errors, ignores
222236

@@ -243,7 +257,7 @@ def read_statements(state: State, data: ReadBuffer, n: int) -> list[Statement]:
243257

244258
def parse_to_binary_ast(
245259
filename: str, options: Options, skip_function_bodies: bool = False
246-
) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool]:
260+
) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool, str, list[tuple[int, str]]]:
247261
# This is a horrible hack to work around a mypyc bug where imported
248262
# module may be not ready in a thread sometimes.
249263
t0 = time.time()
@@ -267,6 +281,8 @@ def parse_to_binary_ast(
267281
import_bytes,
268282
ast_data["is_partial_package"],
269283
ast_data["uses_template_strings"],
284+
ast_data["source_hash"],
285+
ast_data["mypy_comments"],
270286
)
271287

272288

mypy/nodes.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,8 @@ class FileRawData:
360360
"ignored_lines",
361361
"is_partial_stub_package",
362362
"uses_template_strings",
363+
"source_hash",
364+
"mypy_comments",
363365
)
364366

365367
defs: bytes
@@ -368,6 +370,8 @@ class FileRawData:
368370
ignored_lines: dict[int, list[str]]
369371
is_partial_stub_package: bool
370372
uses_template_strings: bool
373+
source_hash: str
374+
mypy_comments: list[tuple[int, str]]
371375

372376
def __init__(
373377
self,
@@ -377,13 +381,17 @@ def __init__(
377381
ignored_lines: dict[int, list[str]],
378382
is_partial_stub_package: bool,
379383
uses_template_strings: bool,
384+
source_hash: str = "",
385+
mypy_comments: list[tuple[int, str]] | None = None,
380386
) -> None:
381387
self.defs = defs
382388
self.imports = imports
383389
self.raw_errors = raw_errors
384390
self.ignored_lines = ignored_lines
385391
self.is_partial_stub_package = is_partial_stub_package
386392
self.uses_template_strings = uses_template_strings
393+
self.source_hash = source_hash
394+
self.mypy_comments = mypy_comments if mypy_comments is not None else []
387395

388396
def write(self, data: WriteBuffer) -> None:
389397
write_bytes(data, self.defs)
@@ -399,6 +407,12 @@ def write(self, data: WriteBuffer) -> None:
399407
write_str_list(data, codes)
400408
write_bool(data, self.is_partial_stub_package)
401409
write_bool(data, self.uses_template_strings)
410+
write_str(data, self.source_hash)
411+
write_tag(data, LIST_GEN)
412+
write_int_bare(data, len(self.mypy_comments))
413+
for line, text in self.mypy_comments:
414+
write_int(data, line)
415+
write_str(data, text)
402416

403417
@classmethod
404418
def read(cls, data: ReadBuffer) -> FileRawData:
@@ -408,8 +422,20 @@ def read(cls, data: ReadBuffer) -> FileRawData:
408422
raw_errors = [read_parse_error(data) for _ in range(read_int_bare(data))]
409423
assert read_tag(data) == DICT_INT_GEN
410424
ignored_lines = {read_int(data): read_str_list(data) for _ in range(read_int_bare(data))}
425+
is_partial_stub_package = read_bool(data)
426+
uses_template_strings = read_bool(data)
427+
source_hash = read_str(data)
428+
assert read_tag(data) == LIST_GEN
429+
mypy_comments = [(read_int(data), read_str(data)) for _ in range(read_int_bare(data))]
411430
return FileRawData(
412-
defs, imports, raw_errors, ignored_lines, read_bool(data), read_bool(data)
431+
defs,
432+
imports,
433+
raw_errors,
434+
ignored_lines,
435+
is_partial_stub_package,
436+
uses_template_strings,
437+
source_hash,
438+
mypy_comments,
413439
)
414440

415441

mypy/parse.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,11 @@ def load_from_raw(
6464
options: Options,
6565
imports_only: bool = False,
6666
) -> MypyFile:
67-
"""Load AST from parsed binary data and report stored errors."""
67+
"""Load AST from parsed binary data and report stored errors.
68+
69+
If imports_only is true, only deserialize imports and return a mostly
70+
empty AST.
71+
"""
6872
from mypy.nativeparse import State, deserialize_imports, read_statements
6973

7074
state = State(options)

mypy/test/test_nativeparse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ def locs(start_line: int, start_column: int, end_line: int, end_column: int) ->
251251
]
252252

253253
with temp_source("print('hello')") as fnam:
254-
b, _, _, _, _, _ = parse_to_binary_ast(fnam, Options())
254+
b, _, _, _, _, _, _, _ = parse_to_binary_ast(fnam, Options())
255255
assert list(b) == (
256256
[LITERAL_INT, 22, nodes.EXPR_STMT, nodes.CALL_EXPR]
257257
+ [nodes.NAME_EXPR, LITERAL_STR]

test-data/unit/check-optional.test

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1356,3 +1356,9 @@ def f(x: object) -> None:
13561356
with C():
13571357
pass
13581358
[builtins fixtures/tuple.pyi]
1359+
1360+
[case testInferOptionalFromDefaultNoneInlineConfig]
1361+
# mypy: implicit-optional
1362+
def f(x: int = None) -> None:
1363+
reveal_type(x) # N: Revealed type is "builtins.int | None"
1364+
f(None)

0 commit comments

Comments (0)