Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ee9c9e1

Browse files
authored
Use sharding for sqlite cache (16 shards) (#21292)
SQLite writes can become a major bottleneck for parallel runs, since only one write can be active at any time. Sharding helps a lot and was pretty easy to implement and reason about. We need to be a bit careful not to have transactions that span multiple shards, as that might cause deadlocks. Sharding is based on the path name without file name extension(s), so cache data for a single module always goes to the same shard. Use a predictable string hash function that is tuned for mypyc. It's much faster than, say, SHA-1 (though hashing probably isn't a huge bottleneck). A version of this with 8 shards was on the order of 20% faster in some cases when using 8 workers. The impact was bigger on macOS, but Linux was also better (at least on a cloud VM). Before merging, I'll run some benchmarks to validate that 16 shards don't regress anything. Used coding-agent assistance, but made the changes in small, reviewed increments. Also updated the cache conversion and diff scripts (tested manually using a coding agent). Related to #21215.
1 parent db2faa7 commit ee9c9e1

10 files changed

Lines changed: 167 additions & 42 deletions

File tree

misc/apply-cache-diff.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,22 @@
1919
from librt.internal import ReadBuffer
2020

2121
from mypy.cache import CacheMeta
22+
from mypy.defaults import SQLITE_NUM_SHARDS
2223
from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
2324
from mypy.util import json_dumps, json_loads
2425

2526

26-
def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
27+
def make_cache(input_dir: str, sqlite: bool, num_shards: int = SQLITE_NUM_SHARDS) -> MetadataStore:
2728
if sqlite:
28-
return SqliteMetadataStore(input_dir)
29+
return SqliteMetadataStore(input_dir, num_shards=num_shards)
2930
else:
3031
return FilesystemMetadataStore(input_dir)
3132

3233

33-
def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None:
34-
cache = make_cache(cache_dir, sqlite)
34+
def apply_diff(
35+
cache_dir: str, diff_file: str, sqlite: bool = False, num_shards: int = SQLITE_NUM_SHARDS
36+
) -> None:
37+
cache = make_cache(cache_dir, sqlite, num_shards=num_shards)
3538
with open(diff_file, "rb") as f:
3639
diff = json_loads(f.read())
3740

@@ -63,11 +66,14 @@ def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None:
6366
def main() -> None:
6467
parser = argparse.ArgumentParser()
6568
parser.add_argument("--sqlite", action="store_true", default=False, help="Use a sqlite cache")
69+
parser.add_argument(
70+
"--num-shards", type=int, default=SQLITE_NUM_SHARDS, help=argparse.SUPPRESS
71+
)
6672
parser.add_argument("cache_dir", help="Directory for the cache")
6773
parser.add_argument("diff", help="Cache diff file")
6874
args = parser.parse_args()
6975

70-
apply_diff(args.cache_dir, args.diff, args.sqlite)
76+
apply_diff(args.cache_dir, args.diff, args.sqlite, num_shards=args.num_shards)
7177

7278

7379
if __name__ == "__main__":

misc/convert-cache.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import argparse
1717

18+
from mypy.defaults import SQLITE_NUM_SHARDS
1819
from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
1920

2021

@@ -26,6 +27,13 @@ def main() -> None:
2627
default=False,
2728
help="Convert to a sqlite cache (default: convert from)",
2829
)
30+
parser.add_argument(
31+
"--num-shards",
32+
type=int,
33+
default=SQLITE_NUM_SHARDS,
34+
dest="num_shards",
35+
help=argparse.SUPPRESS,
36+
)
2937
parser.add_argument(
3038
"--output_dir",
3139
action="store",
@@ -37,17 +45,23 @@ def main() -> None:
3745

3846
input_dir = args.input_dir
3947
output_dir = args.output_dir or input_dir
48+
num_shards = args.num_shards
4049
assert os.path.isdir(output_dir), f"{output_dir} is not a directory"
4150
if args.to_sqlite:
4251
input: MetadataStore = FilesystemMetadataStore(input_dir)
43-
output: MetadataStore = SqliteMetadataStore(output_dir)
52+
output: MetadataStore = SqliteMetadataStore(output_dir, num_shards=num_shards)
4453
else:
45-
fnam = os.path.join(input_dir, "cache.db")
46-
msg = f"{fnam} does not exist"
47-
if not re.match(r"[0-9]+\.[0-9]+$", os.path.basename(input_dir)):
48-
msg += f" (are you missing Python version at the end, e.g. {input_dir}/3.11)"
49-
assert os.path.isfile(fnam), msg
50-
input, output = SqliteMetadataStore(input_dir), FilesystemMetadataStore(output_dir)
54+
if num_shards <= 1:
55+
db_files = [os.path.join(input_dir, "cache.db")]
56+
else:
57+
db_files = [os.path.join(input_dir, f"cache.{i}.db") for i in range(num_shards)]
58+
for fnam in db_files:
59+
msg = f"{fnam} does not exist"
60+
if not re.match(r"[0-9]+\.[0-9]+$", os.path.basename(input_dir)):
61+
msg += f" (are you missing Python version at the end, e.g. {input_dir}/3.11)"
62+
assert os.path.isfile(fnam), msg
63+
input = SqliteMetadataStore(input_dir, num_shards=num_shards)
64+
output = FilesystemMetadataStore(output_dir)
5165

5266
for s in input.list_all():
5367
if s.endswith((".json", ".ff")):

misc/diff-cache.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,14 @@
1919
from librt.internal import ReadBuffer, WriteBuffer
2020

2121
from mypy.cache import CacheMeta, CacheMetaEx
22+
from mypy.defaults import SQLITE_NUM_SHARDS
2223
from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
2324
from mypy.util import json_dumps, json_loads
2425

2526

26-
def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
27+
def make_cache(input_dir: str, sqlite: bool, num_shards: int = SQLITE_NUM_SHARDS) -> MetadataStore:
2728
if sqlite:
28-
return SqliteMetadataStore(input_dir)
29+
return SqliteMetadataStore(input_dir, num_shards=num_shards)
2930
else:
3031
return FilesystemMetadataStore(input_dir)
3132

@@ -154,13 +155,16 @@ def main() -> None:
154155
parser = argparse.ArgumentParser()
155156
parser.add_argument("--verbose", action="store_true", default=False, help="Increase verbosity")
156157
parser.add_argument("--sqlite", action="store_true", default=False, help="Use a sqlite cache")
158+
parser.add_argument(
159+
"--num-shards", type=int, default=SQLITE_NUM_SHARDS, help=argparse.SUPPRESS
160+
)
157161
parser.add_argument("input_dir1", help="Input directory for the original cache")
158162
parser.add_argument("input_dir2", help="Input directory for the target cache")
159163
parser.add_argument("output", help="Output file with the diff from original cache")
160164
args = parser.parse_args()
161165

162-
cache1 = make_cache(args.input_dir1, args.sqlite)
163-
cache2 = make_cache(args.input_dir2, args.sqlite)
166+
cache1 = make_cache(args.input_dir1, args.sqlite, num_shards=args.num_shards)
167+
cache2 = make_cache(args.input_dir2, args.sqlite, num_shards=args.num_shards)
164168

165169
type_misses: dict[str, int] = defaultdict(int)
166170
type_hits: dict[str, int] = defaultdict(int)

mypy/build.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1308,6 +1308,10 @@ def commit(self) -> None:
13081308
self.metastore.commit()
13091309
self.add_stats(cache_commit_time=time.time() - t0)
13101310

1311+
def commit_module(self, meta_file: str) -> None:
1312+
"""Commit cache writes for a single module (identified by its meta file path)."""
1313+
self.metastore.commit_path(meta_file)
1314+
13111315
def verbosity(self) -> int:
13121316
return self.options.verbosity
13131317

@@ -1891,7 +1895,9 @@ def create_metastore(options: Options, parallel_worker: bool) -> MetadataStore:
18911895
"""Create the appropriate metadata store."""
18921896
if options.sqlite_cache:
18931897
mds: MetadataStore = SqliteMetadataStore(
1894-
_cache_dir_prefix(options), set_journal_mode=not parallel_worker
1898+
_cache_dir_prefix(options),
1899+
set_journal_mode=not parallel_worker,
1900+
num_shards=options.sqlite_num_shards,
18951901
)
18961902
else:
18971903
mds = FilesystemMetadataStore(_cache_dir_prefix(options))
@@ -4518,6 +4524,10 @@ def find_stale_sccs(
45184524

45194525
def process_graph(graph: Graph, manager: BuildManager) -> None:
45204526
"""Process everything in dependency order."""
4527+
if manager.workers:
4528+
# Commit any cache writes from graph loading before workers try to read them.
4529+
manager.commit()
4530+
45214531
# Broadcast graph to workers before computing SCCs to save a bit of time.
45224532
# TODO: check if we can optimize by sending only part of the graph needed for given SCC.
45234533
# For example only send modules in the SCC and their dependencies.
@@ -4769,6 +4779,8 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None:
47694779

47704780
t4 = time.time()
47714781
# Flush errors, and write cache in two phases: first data files, then meta files.
4782+
# The two-phase structure is needed because meta.dep_hashes references interface_hash
4783+
# values from other modules in the SCC, which are updated by write_cache().
47724784
meta_tuples = {}
47734785
errors_by_id = {}
47744786
for id in stale:
@@ -4779,7 +4791,11 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None:
47794791
)
47804792
manager.flush_errors(manager.errors.simplify_path(graph[id].xpath), formatted, False)
47814793
errors_by_id[id] = errors
4782-
meta_tuples[id] = graph[id].write_cache()
4794+
meta_tuple = graph[id].write_cache()
4795+
meta_tuples[id] = meta_tuple
4796+
# Commit data file write immediately to avoid holding shard locks across modules.
4797+
if meta_tuple is not None:
4798+
manager.commit_module(meta_tuple[1])
47834799
for id in stale:
47844800
meta_tuple = meta_tuples[id]
47854801
if meta_tuple is None:
@@ -4803,6 +4819,7 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None:
48034819
error_lines=errors_by_id.get(id, []),
48044820
)
48054821
write_cache_meta_ex(meta_file, meta_ex, manager)
4822+
manager.commit_module(meta_file)
48064823
manager.done_sccs.add(ascc.id)
48074824
manager.add_stats(
48084825
load_missing_time=t1 - t0,
@@ -4855,6 +4872,9 @@ def process_stale_scc_interface(
48554872
for id in stale:
48564873
meta_tuple = graph[id].write_cache()
48574874
meta_tuples[id] = meta_tuple
4875+
# Commit data file write immediately to avoid holding shard locks across modules.
4876+
if meta_tuple is not None:
4877+
manager.commit_module(meta_tuple[1])
48584878
for id in stale:
48594879
meta_tuple = meta_tuples[id]
48604880
if meta_tuple is None:
@@ -4867,6 +4887,7 @@ def process_stale_scc_interface(
48674887
if state.priorities.get(dep) != PRI_INDIRECT
48684888
]
48694889
write_cache_meta(meta, manager, meta_file)
4890+
manager.commit_module(meta_file)
48704891
scc_result.append((id, ModuleResult(graph[id].interface_hash.hex(), []), meta_file))
48714892
manager.done_sccs.add(ascc.id)
48724893
manager.add_stats(
@@ -4946,6 +4967,7 @@ def process_stale_scc_implementation(
49464967
# If there are no errors, only write the cache, don't send anything back
49474968
# to the caller (as a micro-optimization).
49484969
write_cache_meta_ex(meta_file, meta_ex, manager)
4970+
manager.commit_module(meta_file)
49494971

49504972
manager.add_stats(type_check_time_implementation=time.time() - t0)
49514973
return scc_result

mypy/defaults.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
PYTHON3_VERSION_MIN: Final = (3, 10) # Keep in sync with supported target versions
1515

1616
CACHE_DIR: Final = ".mypy_cache"
17+
SQLITE_NUM_SHARDS: Final = 16
1718

1819
CONFIG_NAMES: Final = ["mypy.ini", ".mypy.ini"]
1920
SHARED_CONFIG_NAMES: Final = ["pyproject.toml", "setup.cfg"]

mypy/main.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1077,6 +1077,13 @@ def add_invertible_flag(
10771077
help="Use a sqlite database to store the cache",
10781078
group=incremental_group,
10791079
)
1080+
incremental_group.add_argument(
1081+
"--sqlite-num-shards",
1082+
type=int,
1083+
default=defaults.SQLITE_NUM_SHARDS,
1084+
dest="sqlite_num_shards",
1085+
help=argparse.SUPPRESS,
1086+
)
10801087
incremental_group.add_argument(
10811088
"--cache-fine-grained",
10821089
action="store_true",

mypy/metastore.py

Lines changed: 56 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from collections.abc import Iterable
1818
from typing import TYPE_CHECKING, Any
1919

20-
from mypy.util import os_path_join
20+
from mypy.util import hash_path_stem, os_path_join
2121

2222
if TYPE_CHECKING:
2323
# We avoid importing sqlite3 unless we are using it so we can mostly work
@@ -65,6 +65,14 @@ def commit(self) -> None:
6565
called.
6666
"""
6767

68+
def commit_path(self, name: str) -> None:
69+
"""Commit changes related to a specific cache path.
70+
71+
For sharded stores, this commits only the shard containing the path.
72+
Default implementation commits everything.
73+
"""
74+
self.commit()
75+
6876
@abstractmethod
6977
def list_all(self) -> Iterable[str]: ...
7078

@@ -169,23 +177,43 @@ def connect_db(db_file: str, set_journal_mode: bool) -> sqlite3.Connection:
169177

170178

171179
class SqliteMetadataStore(MetadataStore):
172-
def __init__(self, cache_dir_prefix: str, set_journal_mode: bool = False) -> None:
180+
def __init__(
181+
self, cache_dir_prefix: str, set_journal_mode: bool = False, num_shards: int = 1
182+
) -> None:
173183
# We check startswith instead of equality because the version
174184
# will have already been appended by the time the cache dir is
175185
# passed here.
176-
self.db = None
186+
self.dbs: list[sqlite3.Connection] = []
187+
self.num_shards = num_shards
188+
self.dirty_shards: set[int] = set()
177189
if cache_dir_prefix.startswith(os.devnull):
178190
return
179191

180192
os.makedirs(cache_dir_prefix, exist_ok=True)
181-
self.db = connect_db(os_path_join(cache_dir_prefix, "cache.db"), set_journal_mode)
193+
if num_shards <= 1:
194+
self.dbs.append(
195+
connect_db(os_path_join(cache_dir_prefix, "cache.db"), set_journal_mode)
196+
)
197+
else:
198+
for i in range(num_shards):
199+
self.dbs.append(
200+
connect_db(os_path_join(cache_dir_prefix, f"cache.{i}.db"), set_journal_mode)
201+
)
202+
203+
def _shard_index(self, name: str) -> int:
204+
if self.num_shards <= 1:
205+
return 0
206+
return hash_path_stem(name) % self.num_shards
207+
208+
def _db_for(self, name: str) -> sqlite3.Connection:
209+
if not self.dbs:
210+
raise FileNotFoundError()
211+
return self.dbs[self._shard_index(name)]
182212

183213
def _query(self, name: str, field: str) -> Any:
184214
# Raises FileNotFound for consistency with the file system version
185-
if not self.db:
186-
raise FileNotFoundError()
187-
188-
cur = self.db.execute(f"SELECT {field} FROM files2 WHERE path = ?", (name,))
215+
db = self._db_for(name)
216+
cur = db.execute(f"SELECT {field} FROM files2 WHERE path = ?", (name,))
189217
results = cur.fetchall()
190218
if not results:
191219
raise FileNotFoundError()
@@ -205,39 +233,46 @@ def read(self, name: str) -> bytes:
205233
def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
206234
import sqlite3
207235

208-
if not self.db:
236+
if not self.dbs:
209237
return False
210238
try:
211239
if mtime is None:
212240
mtime = time.time()
213-
self.db.execute(
241+
db = self._db_for(name)
242+
db.execute(
214243
"INSERT OR REPLACE INTO files2(path, mtime, data) VALUES(?, ?, ?)",
215244
(name, mtime, data),
216245
)
246+
self.dirty_shards.add(self._shard_index(name))
217247
except sqlite3.OperationalError:
218248
return False
219249
return True
220250

221251
def remove(self, name: str) -> None:
222-
if not self.db:
223-
raise FileNotFoundError()
224-
225-
self.db.execute("DELETE FROM files2 WHERE path = ?", (name,))
252+
db = self._db_for(name)
253+
db.execute("DELETE FROM files2 WHERE path = ?", (name,))
254+
self.dirty_shards.add(self._shard_index(name))
226255

227256
def commit(self) -> None:
228-
if self.db:
229-
self.db.commit()
257+
for i in self.dirty_shards:
258+
self.dbs[i].commit()
259+
self.dirty_shards.clear()
260+
261+
def commit_path(self, name: str) -> None:
262+
i = self._shard_index(name)
263+
if i in self.dirty_shards:
264+
self.dbs[i].commit()
265+
self.dirty_shards.discard(i)
230266

231267
def list_all(self) -> Iterable[str]:
232-
if self.db:
233-
for row in self.db.execute("SELECT path FROM files2"):
268+
for db in self.dbs:
269+
for row in db.execute("SELECT path FROM files2"):
234270
yield row[0]
235271

236272
def close(self) -> None:
237-
if self.db:
238-
db = self.db
239-
self.db = None
273+
for db in self.dbs:
240274
db.close()
275+
self.dbs.clear()
241276

242277
def __del__(self) -> None:
243278
self.close()

mypy/options.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ def __init__(self) -> None:
302302
self.incremental = True
303303
self.cache_dir = defaults.CACHE_DIR
304304
self.sqlite_cache = True
305+
self.sqlite_num_shards = defaults.SQLITE_NUM_SHARDS
305306
self.fixed_format_cache = True
306307
self.debug_cache = False
307308
self.skip_version_check = False

0 commit comments

Comments
 (0)