Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 781f1e6

Browse files
authored
Fix sequential bottleneck in parallel parsing (#21291)
Previously we always read the file, processed inline comments, and calculated sha1 for each parsed file sequentially in Python. Now these are mostly moved to the Rust extension, which allows better parallel scaling. I measured ~5% improvement to parallel type checking times in some cases on macOS (though it was a bit noisy, and used an earlier version of this PR). Related to #21215.
1 parent 3bf45e6 commit 781f1e6

6 files changed

Lines changed: 140 additions & 45 deletions

File tree

mypy/build.py

Lines changed: 80 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -953,7 +953,7 @@ def __init__(
953953
# until all the files have been added. This means that a
954954
# new file can be processed O(n**2) times. This cache
955955
# avoids most of this redundant work.
956-
self.ast_cache: dict[str, tuple[MypyFile, list[ErrorInfo]]] = {}
956+
self.ast_cache: dict[str, tuple[MypyFile, list[ErrorInfo], str | None]] = {}
957957
# Number of times we used GC optimization hack for fresh SCCs.
958958
self.gc_freeze_cycles = 0
959959
# Mapping from SCC id to corresponding SCC instance. This is populated
@@ -1039,11 +1039,66 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S
10391039
as an optimization to parallelize only those parts of the code that can be
10401040
parallelized efficiently.
10411041
"""
1042+
parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw(
1043+
sequential_states, parallel_states
1044+
)
1045+
1046+
for state in parallel_parsed_states:
1047+
# New parser returns serialized ASTs. Deserialize full trees only if not using
1048+
# parallel workers.
1049+
with state.wrap_context():
1050+
assert state.tree is not None
1051+
raw_data = state.tree.raw_data
1052+
if raw_data is not None:
1053+
# Apply inline mypy config before deserialization, since
1054+
# some options (e.g. implicit_optional) affect deserialization
1055+
state.source_hash = raw_data.source_hash
1056+
state.apply_inline_configuration(raw_data.mypy_comments)
1057+
state.tree = load_from_raw(
1058+
state.xpath,
1059+
state.id,
1060+
raw_data,
1061+
self.errors,
1062+
state.options,
1063+
imports_only=bool(self.workers),
1064+
)
1065+
if self.errors.is_blockers():
1066+
self.log("Bailing due to parse errors")
1067+
self.errors.raise_error()
1068+
1069+
for state in parallel_states:
1070+
assert state.tree is not None
1071+
if state in parallel_parsed_states_set:
1072+
if state.tree.raw_data is not None:
1073+
# source_hash was already extracted above, but raw_data
1074+
# may have been preserved for workers (imports_only=True).
1075+
pass
1076+
elif state.source_hash is None:
1077+
# At least namespace packages may not have source.
1078+
state.get_source()
1079+
state.size_hint = os.path.getsize(state.xpath)
1080+
state.early_errors = list(self.errors.error_info_map.get(state.xpath, []))
1081+
state.semantic_analysis_pass1()
1082+
self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash)
1083+
self.modules[state.id] = state.tree
1084+
state.check_blockers()
1085+
state.setup_errors()
1086+
1087+
def parse_files_threaded_raw(
1088+
self, sequential_states: list[State], parallel_states: list[State]
1089+
) -> tuple[list[State], set[State]]:
1090+
"""Parse files using a thread pool.
1091+
1092+
Also parse sequential states while waiting for the parallel results.
1093+
Trees from the new parser are left in raw (serialized) form.
1094+
1095+
Return (list, set) of states that were actually parsed (not cached).
1096+
"""
10421097
futures = []
10431098
# Use both list and a set to have more predictable order of errors,
10441099
# while also not sacrificing performance.
1045-
parallel_parsed_states = []
1046-
parallel_parsed_states_set = set()
1100+
parallel_parsed_states: list[State] = []
1101+
parallel_parsed_states_set: set[State] = set()
10471102
# Use at least --num-workers if specified by user.
10481103
available_threads = max(get_available_threads(), self.options.num_workers)
10491104
# Overhead from trying to parallelize (small) blocking portion of
@@ -1052,53 +1107,27 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S
10521107
with ThreadPoolExecutor(max_workers=min(available_threads, 8)) as executor:
10531108
for state in parallel_states:
10541109
state.needs_parse = False
1055-
# New parser reads source from file directly, we do this only for
1056-
# the side effect of parsing inline mypy configurations.
1057-
state.get_source()
10581110
if state.id not in self.ast_cache:
10591111
self.log(f"Parsing {state.xpath} ({state.id})")
10601112
ignore_errors = state.ignore_all or state.options.ignore_errors
10611113
if ignore_errors:
10621114
self.errors.ignored_files.add(state.xpath)
1063-
futures.append(executor.submit(state.parse_file_inner, state.source or ""))
1115+
futures.append(executor.submit(state.parse_file_inner, ""))
10641116
parallel_parsed_states.append(state)
10651117
parallel_parsed_states_set.add(state)
10661118
else:
10671119
self.log(f"Using cached AST for {state.xpath} ({state.id})")
1068-
state.tree, state.early_errors = self.ast_cache[state.id]
1120+
state.tree, state.early_errors, source_hash = self.ast_cache[state.id]
1121+
state.source_hash = source_hash
10691122

10701123
# Parse sequential before waiting on parallel.
10711124
for state in sequential_states:
10721125
state.parse_file()
10731126

10741127
for fut in wait(futures).done:
10751128
fut.result()
1076-
for state in parallel_parsed_states:
1077-
# New parser returns serialized trees that need to be de-serialized.
1078-
with state.wrap_context():
1079-
assert state.tree is not None
1080-
if state.tree.raw_data:
1081-
state.tree = load_from_raw(
1082-
state.xpath,
1083-
state.id,
1084-
state.tree.raw_data,
1085-
self.errors,
1086-
state.options,
1087-
imports_only=bool(self.workers),
1088-
)
1089-
if self.errors.is_blockers():
1090-
self.log("Bailing due to parse errors")
1091-
self.errors.raise_error()
10921129

1093-
for state in parallel_states:
1094-
assert state.tree is not None
1095-
if state in parallel_parsed_states_set:
1096-
state.early_errors = list(self.errors.error_info_map.get(state.xpath, []))
1097-
state.semantic_analysis_pass1()
1098-
self.ast_cache[state.id] = (state.tree, state.early_errors)
1099-
self.modules[state.id] = state.tree
1100-
state.check_blockers()
1101-
state.setup_errors()
1130+
return parallel_parsed_states, parallel_parsed_states_set
11021131

11031132
def post_parse_all(self, states: list[State]) -> None:
11041133
for state in states:
@@ -3090,7 +3119,6 @@ def get_source(self) -> str:
30903119
self.source_hash = compute_hash(source)
30913120

30923121
self.parse_inline_configuration(source)
3093-
self.check_for_invalid_options()
30943122

30953123
self.size_hint = len(source)
30963124
self.time_spent_us += time_spent_us(t0)
@@ -3115,7 +3143,10 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None =
31153143
# The file was already parsed.
31163144
return
31173145

3118-
source = self.get_source()
3146+
if raw_data is None:
3147+
source = self.get_source()
3148+
else:
3149+
source = ""
31193150
manager = self.manager
31203151
# Can we reuse a previously parsed AST? This avoids redundant work in daemon.
31213152
if self.id not in manager.ast_cache:
@@ -3125,6 +3156,12 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None =
31253156
self.manager.errors.ignored_files.add(self.xpath)
31263157
with self.wrap_context():
31273158
manager.errors.set_file(self.xpath, self.id, options=self.options)
3159+
if raw_data is not None:
3160+
# Apply inline mypy config before deserialization, since
3161+
# some options (e.g. implicit_optional) affect how the
3162+
# AST is built during deserialization.
3163+
self.source_hash = raw_data.source_hash
3164+
self.apply_inline_configuration(raw_data.mypy_comments)
31283165
self.parse_file_inner(source, raw_data)
31293166
assert self.tree is not None
31303167
# New parser returns serialized trees that need to be de-serialized.
@@ -3149,14 +3186,15 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None =
31493186
else:
31503187
# Reuse a cached AST
31513188
manager.log(f"Using cached AST for {self.xpath} ({self.id})")
3152-
self.tree, self.early_errors = manager.ast_cache[self.id]
3189+
self.tree, self.early_errors, source_hash = manager.ast_cache[self.id]
3190+
self.source_hash = source_hash
31533191

31543192
assert self.tree is not None
31553193
if not temporary:
31563194
manager.modules[self.id] = self.tree
31573195
self.check_blockers()
31583196

3159-
manager.ast_cache[self.id] = (self.tree, self.early_errors)
3197+
manager.ast_cache[self.id] = (self.tree, self.early_errors, self.source_hash)
31603198
self.setup_errors()
31613199

31623200
def setup_errors(self) -> None:
@@ -3169,12 +3207,17 @@ def setup_errors(self) -> None:
31693207
def parse_inline_configuration(self, source: str) -> None:
31703208
"""Check for inline mypy: options directive and parse them."""
31713209
flags = get_mypy_comments(source)
3210+
self.apply_inline_configuration(flags)
3211+
3212+
def apply_inline_configuration(self, flags: list[tuple[int, str]] | None) -> None:
3213+
"""Apply inline mypy configuration comments and check for invalid options."""
31723214
if flags:
31733215
changes, config_errors = parse_mypy_comments(flags, self.options)
31743216
self.options = self.options.apply_changes(changes)
31753217
self.manager.errors.set_file(self.xpath, self.id, self.options)
31763218
for lineno, error in config_errors:
31773219
self.manager.error(lineno, error)
3220+
self.check_for_invalid_options()
31783221

31793222
def check_for_invalid_options(self) -> None:
31803223
if self.options.mypyc and not self.options.strict_bytes:

mypy/nativeparse.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -210,13 +210,27 @@ def native_parse(
210210
node.path = filename
211211
return node, [], []
212212

213-
b, errors, ignores, import_bytes, is_partial_package, uses_template_strings = (
214-
parse_to_binary_ast(filename, options, skip_function_bodies)
215-
)
213+
(
214+
b,
215+
errors,
216+
ignores,
217+
import_bytes,
218+
is_partial_package,
219+
uses_template_strings,
220+
source_hash,
221+
mypy_comments,
222+
) = parse_to_binary_ast(filename, options, skip_function_bodies)
216223
node = MypyFile([], [])
217224
node.path = filename
218225
node.raw_data = FileRawData(
219-
b, import_bytes, errors, dict(ignores), is_partial_package, uses_template_strings
226+
b,
227+
import_bytes,
228+
errors,
229+
dict(ignores),
230+
is_partial_package,
231+
uses_template_strings,
232+
source_hash,
233+
mypy_comments,
220234
)
221235
return node, errors, ignores
222236

@@ -243,7 +257,7 @@ def read_statements(state: State, data: ReadBuffer, n: int) -> list[Statement]:
243257

244258
def parse_to_binary_ast(
245259
filename: str, options: Options, skip_function_bodies: bool = False
246-
) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool]:
260+
) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool, str, list[tuple[int, str]]]:
247261
# This is a horrible hack to work around a mypyc bug where imported
248262
# module may be not ready in a thread sometimes.
249263
t0 = time.time()
@@ -267,6 +281,8 @@ def parse_to_binary_ast(
267281
import_bytes,
268282
ast_data["is_partial_package"],
269283
ast_data["uses_template_strings"],
284+
ast_data["source_hash"],
285+
ast_data["mypy_comments"],
270286
)
271287

272288

mypy/nodes.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,8 @@ class FileRawData:
360360
"ignored_lines",
361361
"is_partial_stub_package",
362362
"uses_template_strings",
363+
"source_hash",
364+
"mypy_comments",
363365
)
364366

365367
defs: bytes
@@ -368,6 +370,8 @@ class FileRawData:
368370
ignored_lines: dict[int, list[str]]
369371
is_partial_stub_package: bool
370372
uses_template_strings: bool
373+
source_hash: str
374+
mypy_comments: list[tuple[int, str]]
371375

372376
def __init__(
373377
self,
@@ -377,13 +381,17 @@ def __init__(
377381
ignored_lines: dict[int, list[str]],
378382
is_partial_stub_package: bool,
379383
uses_template_strings: bool,
384+
source_hash: str = "",
385+
mypy_comments: list[tuple[int, str]] | None = None,
380386
) -> None:
381387
self.defs = defs
382388
self.imports = imports
383389
self.raw_errors = raw_errors
384390
self.ignored_lines = ignored_lines
385391
self.is_partial_stub_package = is_partial_stub_package
386392
self.uses_template_strings = uses_template_strings
393+
self.source_hash = source_hash
394+
self.mypy_comments = mypy_comments if mypy_comments is not None else []
387395

388396
def write(self, data: WriteBuffer) -> None:
389397
write_bytes(data, self.defs)
@@ -399,6 +407,12 @@ def write(self, data: WriteBuffer) -> None:
399407
write_str_list(data, codes)
400408
write_bool(data, self.is_partial_stub_package)
401409
write_bool(data, self.uses_template_strings)
410+
write_str(data, self.source_hash)
411+
write_tag(data, LIST_GEN)
412+
write_int_bare(data, len(self.mypy_comments))
413+
for line, text in self.mypy_comments:
414+
write_int(data, line)
415+
write_str(data, text)
402416

403417
@classmethod
404418
def read(cls, data: ReadBuffer) -> FileRawData:
@@ -408,8 +422,20 @@ def read(cls, data: ReadBuffer) -> FileRawData:
408422
raw_errors = [read_parse_error(data) for _ in range(read_int_bare(data))]
409423
assert read_tag(data) == DICT_INT_GEN
410424
ignored_lines = {read_int(data): read_str_list(data) for _ in range(read_int_bare(data))}
425+
is_partial_stub_package = read_bool(data)
426+
uses_template_strings = read_bool(data)
427+
source_hash = read_str(data)
428+
assert read_tag(data) == LIST_GEN
429+
mypy_comments = [(read_int(data), read_str(data)) for _ in range(read_int_bare(data))]
411430
return FileRawData(
412-
defs, imports, raw_errors, ignored_lines, read_bool(data), read_bool(data)
431+
defs,
432+
imports,
433+
raw_errors,
434+
ignored_lines,
435+
is_partial_stub_package,
436+
uses_template_strings,
437+
source_hash,
438+
mypy_comments,
413439
)
414440

415441

mypy/parse.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,11 @@ def load_from_raw(
6464
options: Options,
6565
imports_only: bool = False,
6666
) -> MypyFile:
67-
"""Load AST from parsed binary data and report stored errors."""
67+
"""Load AST from parsed binary data and report stored errors.
68+
69+
If imports_only is true, only deserialize imports and return a mostly
70+
empty AST.
71+
"""
6872
from mypy.nativeparse import State, deserialize_imports, read_statements
6973

7074
state = State(options)

mypy/test/test_nativeparse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ def locs(start_line: int, start_column: int, end_line: int, end_column: int) ->
251251
]
252252

253253
with temp_source("print('hello')") as fnam:
254-
b, _, _, _, _, _ = parse_to_binary_ast(fnam, Options())
254+
b, _, _, _, _, _, _, _ = parse_to_binary_ast(fnam, Options())
255255
assert list(b) == (
256256
[LITERAL_INT, 22, nodes.EXPR_STMT, nodes.CALL_EXPR]
257257
+ [nodes.NAME_EXPR, LITERAL_STR]

test-data/unit/check-optional.test

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1356,3 +1356,9 @@ def f(x: object) -> None:
13561356
with C():
13571357
pass
13581358
[builtins fixtures/tuple.pyi]
1359+
1360+
[case testInferOptionalFromDefaultNoneInlineConfig]
1361+
# mypy: implicit-optional
1362+
def f(x: int = None) -> None:
1363+
reveal_type(x) # N: Revealed type is "builtins.int | None"
1364+
f(None)

0 commit comments

Comments (0)