Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ee9c9e1

Browse files
authored
Use sharding for sqlite cache (16 shards) (#21292)
SQLite writes can become a major bottleneck for parallel runs, since only one write can be active at any time. Sharding helps a lot and was pretty easy to implement and reason about. We need to be a bit careful not to have transactions that span multiple shards, as that might cause deadlocks. Sharding is based on the path name without file name extension(s), so cache data for a single module always goes to the same shard. Use a predictable string hash function that is tuned for mypyc. It's much faster than, say, SHA-1 (though hashing probably isn't a huge bottleneck). A version of this with 8 shards was on the order of 20% faster in some cases when using 8 workers. The impact was bigger on macOS, but Linux was also better (at least on a cloud VM). Before merging, I'll run some benchmarks to validate that 16 shards don't regress anything. Used coding-agent assistance, but made the changes in small, reviewed increments. Also updated the cache conversion and diff scripts (tested manually using a coding agent). Related to #21215.
1 parent db2faa7 commit ee9c9e1

10 files changed

Lines changed: 167 additions & 42 deletions

File tree

misc/apply-cache-diff.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,22 @@
1919
from librt.internal import ReadBuffer
2020

2121
from mypy.cache import CacheMeta
22+
from mypy.defaults import SQLITE_NUM_SHARDS
2223
from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
2324
from mypy.util import json_dumps, json_loads
2425

2526

26-
def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
27+
def make_cache(input_dir: str, sqlite: bool, num_shards: int = SQLITE_NUM_SHARDS) -> MetadataStore:
2728
if sqlite:
28-
return SqliteMetadataStore(input_dir)
29+
return SqliteMetadataStore(input_dir, num_shards=num_shards)
2930
else:
3031
return FilesystemMetadataStore(input_dir)
3132

3233

33-
def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None:
34-
cache = make_cache(cache_dir, sqlite)
34+
def apply_diff(
35+
cache_dir: str, diff_file: str, sqlite: bool = False, num_shards: int = SQLITE_NUM_SHARDS
36+
) -> None:
37+
cache = make_cache(cache_dir, sqlite, num_shards=num_shards)
3538
with open(diff_file, "rb") as f:
3639
diff = json_loads(f.read())
3740

@@ -63,11 +66,14 @@ def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None:
6366
def main() -> None:
6467
parser = argparse.ArgumentParser()
6568
parser.add_argument("--sqlite", action="store_true", default=False, help="Use a sqlite cache")
69+
parser.add_argument(
70+
"--num-shards", type=int, default=SQLITE_NUM_SHARDS, help=argparse.SUPPRESS
71+
)
6672
parser.add_argument("cache_dir", help="Directory for the cache")
6773
parser.add_argument("diff", help="Cache diff file")
6874
args = parser.parse_args()
6975

70-
apply_diff(args.cache_dir, args.diff, args.sqlite)
76+
apply_diff(args.cache_dir, args.diff, args.sqlite, num_shards=args.num_shards)
7177

7278

7379
if __name__ == "__main__":

misc/convert-cache.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import argparse
1717

18+
from mypy.defaults import SQLITE_NUM_SHARDS
1819
from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
1920

2021

@@ -26,6 +27,13 @@ def main() -> None:
2627
default=False,
2728
help="Convert to a sqlite cache (default: convert from)",
2829
)
30+
parser.add_argument(
31+
"--num-shards",
32+
type=int,
33+
default=SQLITE_NUM_SHARDS,
34+
dest="num_shards",
35+
help=argparse.SUPPRESS,
36+
)
2937
parser.add_argument(
3038
"--output_dir",
3139
action="store",
@@ -37,17 +45,23 @@ def main() -> None:
3745

3846
input_dir = args.input_dir
3947
output_dir = args.output_dir or input_dir
48+
num_shards = args.num_shards
4049
assert os.path.isdir(output_dir), f"{output_dir} is not a directory"
4150
if args.to_sqlite:
4251
input: MetadataStore = FilesystemMetadataStore(input_dir)
43-
output: MetadataStore = SqliteMetadataStore(output_dir)
52+
output: MetadataStore = SqliteMetadataStore(output_dir, num_shards=num_shards)
4453
else:
45-
fnam = os.path.join(input_dir, "cache.db")
46-
msg = f"{fnam} does not exist"
47-
if not re.match(r"[0-9]+\.[0-9]+$", os.path.basename(input_dir)):
48-
msg += f" (are you missing Python version at the end, e.g. {input_dir}/3.11)"
49-
assert os.path.isfile(fnam), msg
50-
input, output = SqliteMetadataStore(input_dir), FilesystemMetadataStore(output_dir)
54+
if num_shards <= 1:
55+
db_files = [os.path.join(input_dir, "cache.db")]
56+
else:
57+
db_files = [os.path.join(input_dir, f"cache.{i}.db") for i in range(num_shards)]
58+
for fnam in db_files:
59+
msg = f"{fnam} does not exist"
60+
if not re.match(r"[0-9]+\.[0-9]+$", os.path.basename(input_dir)):
61+
msg += f" (are you missing Python version at the end, e.g. {input_dir}/3.11)"
62+
assert os.path.isfile(fnam), msg
63+
input = SqliteMetadataStore(input_dir, num_shards=num_shards)
64+
output = FilesystemMetadataStore(output_dir)
5165

5266
for s in input.list_all():
5367
if s.endswith((".json", ".ff")):

misc/diff-cache.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,14 @@
1919
from librt.internal import ReadBuffer, WriteBuffer
2020

2121
from mypy.cache import CacheMeta, CacheMetaEx
22+
from mypy.defaults import SQLITE_NUM_SHARDS
2223
from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
2324
from mypy.util import json_dumps, json_loads
2425

2526

26-
def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
27+
def make_cache(input_dir: str, sqlite: bool, num_shards: int = SQLITE_NUM_SHARDS) -> MetadataStore:
2728
if sqlite:
28-
return SqliteMetadataStore(input_dir)
29+
return SqliteMetadataStore(input_dir, num_shards=num_shards)
2930
else:
3031
return FilesystemMetadataStore(input_dir)
3132

@@ -154,13 +155,16 @@ def main() -> None:
154155
parser = argparse.ArgumentParser()
155156
parser.add_argument("--verbose", action="store_true", default=False, help="Increase verbosity")
156157
parser.add_argument("--sqlite", action="store_true", default=False, help="Use a sqlite cache")
158+
parser.add_argument(
159+
"--num-shards", type=int, default=SQLITE_NUM_SHARDS, help=argparse.SUPPRESS
160+
)
157161
parser.add_argument("input_dir1", help="Input directory for the original cache")
158162
parser.add_argument("input_dir2", help="Input directory for the target cache")
159163
parser.add_argument("output", help="Output file with the diff from original cache")
160164
args = parser.parse_args()
161165

162-
cache1 = make_cache(args.input_dir1, args.sqlite)
163-
cache2 = make_cache(args.input_dir2, args.sqlite)
166+
cache1 = make_cache(args.input_dir1, args.sqlite, num_shards=args.num_shards)
167+
cache2 = make_cache(args.input_dir2, args.sqlite, num_shards=args.num_shards)
164168

165169
type_misses: dict[str, int] = defaultdict(int)
166170
type_hits: dict[str, int] = defaultdict(int)

mypy/build.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1308,6 +1308,10 @@ def commit(self) -> None:
13081308
self.metastore.commit()
13091309
self.add_stats(cache_commit_time=time.time() - t0)
13101310

1311+
def commit_module(self, meta_file: str) -> None:
1312+
"""Commit cache writes for a single module (identified by its meta file path)."""
1313+
self.metastore.commit_path(meta_file)
1314+
13111315
def verbosity(self) -> int:
13121316
return self.options.verbosity
13131317

@@ -1891,7 +1895,9 @@ def create_metastore(options: Options, parallel_worker: bool) -> MetadataStore:
18911895
"""Create the appropriate metadata store."""
18921896
if options.sqlite_cache:
18931897
mds: MetadataStore = SqliteMetadataStore(
1894-
_cache_dir_prefix(options), set_journal_mode=not parallel_worker
1898+
_cache_dir_prefix(options),
1899+
set_journal_mode=not parallel_worker,
1900+
num_shards=options.sqlite_num_shards,
18951901
)
18961902
else:
18971903
mds = FilesystemMetadataStore(_cache_dir_prefix(options))
@@ -4518,6 +4524,10 @@ def find_stale_sccs(
45184524

45194525
def process_graph(graph: Graph, manager: BuildManager) -> None:
45204526
"""Process everything in dependency order."""
4527+
if manager.workers:
4528+
# Commit any cache writes from graph loading before workers try to read them.
4529+
manager.commit()
4530+
45214531
# Broadcast graph to workers before computing SCCs to save a bit of time.
45224532
# TODO: check if we can optimize by sending only part of the graph needed for given SCC.
45234533
# For example only send modules in the SCC and their dependencies.
@@ -4769,6 +4779,8 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None:
47694779

47704780
t4 = time.time()
47714781
# Flush errors, and write cache in two phases: first data files, then meta files.
4782+
# The two-phase structure is needed because meta.dep_hashes references interface_hash
4783+
# values from other modules in the SCC, which are updated by write_cache().
47724784
meta_tuples = {}
47734785
errors_by_id = {}
47744786
for id in stale:
@@ -4779,7 +4791,11 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None:
47794791
)
47804792
manager.flush_errors(manager.errors.simplify_path(graph[id].xpath), formatted, False)
47814793
errors_by_id[id] = errors
4782-
meta_tuples[id] = graph[id].write_cache()
4794+
meta_tuple = graph[id].write_cache()
4795+
meta_tuples[id] = meta_tuple
4796+
# Commit data file write immediately to avoid holding shard locks across modules.
4797+
if meta_tuple is not None:
4798+
manager.commit_module(meta_tuple[1])
47834799
for id in stale:
47844800
meta_tuple = meta_tuples[id]
47854801
if meta_tuple is None:
@@ -4803,6 +4819,7 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None:
48034819
error_lines=errors_by_id.get(id, []),
48044820
)
48054821
write_cache_meta_ex(meta_file, meta_ex, manager)
4822+
manager.commit_module(meta_file)
48064823
manager.done_sccs.add(ascc.id)
48074824
manager.add_stats(
48084825
load_missing_time=t1 - t0,
@@ -4855,6 +4872,9 @@ def process_stale_scc_interface(
48554872
for id in stale:
48564873
meta_tuple = graph[id].write_cache()
48574874
meta_tuples[id] = meta_tuple
4875+
# Commit data file write immediately to avoid holding shard locks across modules.
4876+
if meta_tuple is not None:
4877+
manager.commit_module(meta_tuple[1])
48584878
for id in stale:
48594879
meta_tuple = meta_tuples[id]
48604880
if meta_tuple is None:
@@ -4867,6 +4887,7 @@ def process_stale_scc_interface(
48674887
if state.priorities.get(dep) != PRI_INDIRECT
48684888
]
48694889
write_cache_meta(meta, manager, meta_file)
4890+
manager.commit_module(meta_file)
48704891
scc_result.append((id, ModuleResult(graph[id].interface_hash.hex(), []), meta_file))
48714892
manager.done_sccs.add(ascc.id)
48724893
manager.add_stats(
@@ -4946,6 +4967,7 @@ def process_stale_scc_implementation(
49464967
# If there are no errors, only write the cache, don't send anything back
49474968
# to the caller (as a micro-optimization).
49484969
write_cache_meta_ex(meta_file, meta_ex, manager)
4970+
manager.commit_module(meta_file)
49494971

49504972
manager.add_stats(type_check_time_implementation=time.time() - t0)
49514973
return scc_result

mypy/defaults.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
PYTHON3_VERSION_MIN: Final = (3, 10) # Keep in sync with supported target versions
1515

1616
CACHE_DIR: Final = ".mypy_cache"
17+
SQLITE_NUM_SHARDS: Final = 16
1718

1819
CONFIG_NAMES: Final = ["mypy.ini", ".mypy.ini"]
1920
SHARED_CONFIG_NAMES: Final = ["pyproject.toml", "setup.cfg"]

mypy/main.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1077,6 +1077,13 @@ def add_invertible_flag(
10771077
help="Use a sqlite database to store the cache",
10781078
group=incremental_group,
10791079
)
1080+
incremental_group.add_argument(
1081+
"--sqlite-num-shards",
1082+
type=int,
1083+
default=defaults.SQLITE_NUM_SHARDS,
1084+
dest="sqlite_num_shards",
1085+
help=argparse.SUPPRESS,
1086+
)
10801087
incremental_group.add_argument(
10811088
"--cache-fine-grained",
10821089
action="store_true",

mypy/metastore.py

Lines changed: 56 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from collections.abc import Iterable
1818
from typing import TYPE_CHECKING, Any
1919

20-
from mypy.util import os_path_join
20+
from mypy.util import hash_path_stem, os_path_join
2121

2222
if TYPE_CHECKING:
2323
# We avoid importing sqlite3 unless we are using it so we can mostly work
@@ -65,6 +65,14 @@ def commit(self) -> None:
6565
called.
6666
"""
6767

68+
def commit_path(self, name: str) -> None:
69+
"""Commit changes related to a specific cache path.
70+
71+
For sharded stores, this commits only the shard containing the path.
72+
Default implementation commits everything.
73+
"""
74+
self.commit()
75+
6876
@abstractmethod
6977
def list_all(self) -> Iterable[str]: ...
7078

@@ -169,23 +177,43 @@ def connect_db(db_file: str, set_journal_mode: bool) -> sqlite3.Connection:
169177

170178

171179
class SqliteMetadataStore(MetadataStore):
172-
def __init__(self, cache_dir_prefix: str, set_journal_mode: bool = False) -> None:
180+
def __init__(
181+
self, cache_dir_prefix: str, set_journal_mode: bool = False, num_shards: int = 1
182+
) -> None:
173183
# We check startswith instead of equality because the version
174184
# will have already been appended by the time the cache dir is
175185
# passed here.
176-
self.db = None
186+
self.dbs: list[sqlite3.Connection] = []
187+
self.num_shards = num_shards
188+
self.dirty_shards: set[int] = set()
177189
if cache_dir_prefix.startswith(os.devnull):
178190
return
179191

180192
os.makedirs(cache_dir_prefix, exist_ok=True)
181-
self.db = connect_db(os_path_join(cache_dir_prefix, "cache.db"), set_journal_mode)
193+
if num_shards <= 1:
194+
self.dbs.append(
195+
connect_db(os_path_join(cache_dir_prefix, "cache.db"), set_journal_mode)
196+
)
197+
else:
198+
for i in range(num_shards):
199+
self.dbs.append(
200+
connect_db(os_path_join(cache_dir_prefix, f"cache.{i}.db"), set_journal_mode)
201+
)
202+
203+
def _shard_index(self, name: str) -> int:
204+
if self.num_shards <= 1:
205+
return 0
206+
return hash_path_stem(name) % self.num_shards
207+
208+
def _db_for(self, name: str) -> sqlite3.Connection:
209+
if not self.dbs:
210+
raise FileNotFoundError()
211+
return self.dbs[self._shard_index(name)]
182212

183213
def _query(self, name: str, field: str) -> Any:
184214
# Raises FileNotFound for consistency with the file system version
185-
if not self.db:
186-
raise FileNotFoundError()
187-
188-
cur = self.db.execute(f"SELECT {field} FROM files2 WHERE path = ?", (name,))
215+
db = self._db_for(name)
216+
cur = db.execute(f"SELECT {field} FROM files2 WHERE path = ?", (name,))
189217
results = cur.fetchall()
190218
if not results:
191219
raise FileNotFoundError()
@@ -205,39 +233,46 @@ def read(self, name: str) -> bytes:
205233
def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
206234
import sqlite3
207235

208-
if not self.db:
236+
if not self.dbs:
209237
return False
210238
try:
211239
if mtime is None:
212240
mtime = time.time()
213-
self.db.execute(
241+
db = self._db_for(name)
242+
db.execute(
214243
"INSERT OR REPLACE INTO files2(path, mtime, data) VALUES(?, ?, ?)",
215244
(name, mtime, data),
216245
)
246+
self.dirty_shards.add(self._shard_index(name))
217247
except sqlite3.OperationalError:
218248
return False
219249
return True
220250

221251
def remove(self, name: str) -> None:
222-
if not self.db:
223-
raise FileNotFoundError()
224-
225-
self.db.execute("DELETE FROM files2 WHERE path = ?", (name,))
252+
db = self._db_for(name)
253+
db.execute("DELETE FROM files2 WHERE path = ?", (name,))
254+
self.dirty_shards.add(self._shard_index(name))
226255

227256
def commit(self) -> None:
228-
if self.db:
229-
self.db.commit()
257+
for i in self.dirty_shards:
258+
self.dbs[i].commit()
259+
self.dirty_shards.clear()
260+
261+
def commit_path(self, name: str) -> None:
262+
i = self._shard_index(name)
263+
if i in self.dirty_shards:
264+
self.dbs[i].commit()
265+
self.dirty_shards.discard(i)
230266

231267
def list_all(self) -> Iterable[str]:
232-
if self.db:
233-
for row in self.db.execute("SELECT path FROM files2"):
268+
for db in self.dbs:
269+
for row in db.execute("SELECT path FROM files2"):
234270
yield row[0]
235271

236272
def close(self) -> None:
237-
if self.db:
238-
db = self.db
239-
self.db = None
273+
for db in self.dbs:
240274
db.close()
275+
self.dbs.clear()
241276

242277
def __del__(self) -> None:
243278
self.close()

mypy/options.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ def __init__(self) -> None:
302302
self.incremental = True
303303
self.cache_dir = defaults.CACHE_DIR
304304
self.sqlite_cache = True
305+
self.sqlite_num_shards = defaults.SQLITE_NUM_SHARDS
305306
self.fixed_format_cache = True
306307
self.debug_cache = False
307308
self.skip_version_check = False

0 commit comments

Comments
 (0)