Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit b3a7f89

Browse files
authored
Implement a sqlite-based store for the cache (#6023)
File system operations on OS X are pretty slow, and untarring a large archive of mypy cache information can get pretty slow. Work around this by using a sqlite database to store the entire cache in one file. To do this we introduce a generic interface for storing metadata, called `MetadataStore`. It presents an essentially key/value interface. We provide two implementations, one using the existing file system backing and one using sqlite. It is enabled with the option `--sqlite-cache`, but is not the default yet. I'm not sure what the right thing to do about testing is. I've tested it with the default changed, and everything passes.
1 parent eefb35b commit b3a7f89

8 files changed

Lines changed: 306 additions & 54 deletions

File tree

misc/convert-cache.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/usr/bin/env python3
"""Script for converting between cache formats.

We support a filesystem tree based cache and a sqlite based cache.
See mypy/metastore.py for details.
"""

import sys
import os
# Make the repository root importable so `mypy.metastore` resolves when
# this script is run from misc/.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import argparse
from mypy.metastore import FilesystemMetadataStore, SqliteMetadataStore


def main() -> None:
    """Copy every .json cache entry from one store format to the other.

    Direction is controlled by --to-sqlite (default converts sqlite -> tree).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--to-sqlite', action='store_true', default=False,
                        help='Convert to a sqlite cache (default: convert from)')
    parser.add_argument('--output_dir', action='store', default=None,
                        help="Output cache location (default: same as input)")
    parser.add_argument('input_dir',
                        help="Input directory for the cache")
    # BUG FIX: the transcribed source read `args parser.parse_args()`
    # (missing `=`), which is a syntax error.
    args = parser.parse_args()

    input_dir = args.input_dir
    output_dir = args.output_dir or input_dir
    # `src`/`dst` instead of `input`/`output`: avoid shadowing the
    # `input` builtin.
    if args.to_sqlite:
        src, dst = FilesystemMetadataStore(input_dir), SqliteMetadataStore(output_dir)
    else:
        src, dst = SqliteMetadataStore(input_dir), FilesystemMetadataStore(output_dir)

    # Only .json entries are cache payload; preserve each entry's mtime
    # so cache validation still matches after conversion.
    for name in src.list_all():
        if name.endswith('.json'):
            assert dst.write(name, src.read(name), src.getmtime(name)), \
                "Failed to write cache file!"
    dst.commit()


if __name__ == '__main__':
    main()

mypy/build.py

Lines changed: 38 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
from mypy.plugins.default import DefaultPlugin
5353
from mypy.server.deps import get_dependencies
5454
from mypy.fscache import FileSystemCache
55+
from mypy.metastore import MetadataStore, FilesystemMetadataStore, SqliteMetadataStore
5556
from mypy.typestate import TypeState, reset_global_state
5657

5758
from mypy.mypyc_hacks import BuildManagerBase
@@ -208,6 +209,7 @@ def _build(sources: List[BuildSource],
208209
TypeState.reset_all_subtype_caches()
209210
return BuildResult(manager, graph)
210211
finally:
212+
manager.metastore.commit()
211213
manager.log("Build finished in %.3f seconds with %d modules, and %d errors" %
212214
(time.time() - manager.start_time,
213215
len(manager.modules),
@@ -506,6 +508,10 @@ def __init__(self, data_dir: str,
506508
not options.fine_grained_incremental or options.use_fine_grained_cache)
507509
self.fscache = fscache
508510
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options)
511+
if options.sqlite_cache:
512+
self.metastore = SqliteMetadataStore(_cache_dir_prefix(self)) # type: MetadataStore
513+
else:
514+
self.metastore = FilesystemMetadataStore(_cache_dir_prefix(self))
509515

510516
# a mapping from source files to their corresponding shadow files
511517
# for efficient lookup
@@ -551,7 +557,7 @@ def getmtime(self, path: str) -> int:
551557
if self.options.bazel:
552558
return 0
553559
else:
554-
return int(os.path.getmtime(path))
560+
return int(self.metastore.getmtime(path))
555561

556562
def normpath(self, path: str) -> str:
557563
"""Convert path to absolute; but to relative in bazel mode.
@@ -691,7 +697,8 @@ def write_protocol_deps_cache(proto_deps: Dict[str, Set[str]],
691697
(i.e. <SuperProto[wildcard]>, <Proto[wildcard]> -> <Proto>) are written to the normal
692698
per-file fine grained dependency caches.
693699
"""
694-
proto_meta, proto_cache = get_protocol_deps_cache_name(manager)
700+
metastore = manager.metastore
701+
proto_meta, proto_cache = get_protocol_deps_cache_name()
695702
meta_snapshot = {} # type: Dict[str, str]
696703
error = False
697704
for id, st in graph.items():
@@ -704,11 +711,11 @@ def write_protocol_deps_cache(proto_deps: Dict[str, Set[str]],
704711
assert st.meta, "Module must be either parsed or cached"
705712
meta_snapshot[id] = st.meta.hash
706713

707-
if not atomic_write(proto_meta, json.dumps(meta_snapshot), '\n'):
714+
if not metastore.write(proto_meta, json.dumps(meta_snapshot)):
708715
manager.log("Error writing protocol meta JSON file {}".format(proto_cache))
709716
error = True
710717
listed_proto_deps = {k: list(v) for (k, v) in proto_deps.items()}
711-
if not atomic_write(proto_cache, json.dumps(listed_proto_deps), '\n'):
718+
if not metastore.write(proto_cache, json.dumps(listed_proto_deps)):
712719
manager.log("Error writing protocol deps JSON file {}".format(proto_cache))
713720
error = True
714721
if error:
@@ -717,19 +724,20 @@ def write_protocol_deps_cache(proto_deps: Dict[str, Set[str]],
717724
blocker=True)
718725

719726

727+
PLUGIN_SNAPSHOT_FILE = '@plugins_snapshot.json' # type: Final
728+
729+
720730
def write_plugins_snapshot(manager: BuildManager) -> None:
721731
"""Write snapshot of versions and hashes of currently active plugins."""
722-
name = os.path.join(_cache_dir_prefix(manager), '@plugins_snapshot.json')
723-
if not atomic_write(name, json.dumps(manager.plugins_snapshot), '\n'):
732+
if not manager.metastore.write(PLUGIN_SNAPSHOT_FILE, json.dumps(manager.plugins_snapshot)):
724733
manager.errors.set_file(_cache_dir_prefix(manager), None)
725734
manager.errors.report(0, 0, "Error writing plugins snapshot",
726735
blocker=True)
727736

728737

729738
def read_plugins_snapshot(manager: BuildManager) -> Optional[Dict[str, str]]:
730739
"""Read cached snapshot of versions and hashes of plugins from previous run."""
731-
name = os.path.join(_cache_dir_prefix(manager), '@plugins_snapshot.json')
732-
snapshot = _load_json_file(name, manager,
740+
snapshot = _load_json_file(PLUGIN_SNAPSHOT_FILE, manager,
733741
log_sucess='Plugins snapshot ',
734742
log_error='Could not load plugins snapshot: ')
735743
if snapshot is None:
@@ -748,7 +756,7 @@ def read_protocol_cache(manager: BuildManager,
748756
See docstring for write_protocol_cache for details about which kinds of
749757
dependencies are read.
750758
"""
751-
proto_meta, proto_cache = get_protocol_deps_cache_name(manager)
759+
proto_meta, proto_cache = get_protocol_deps_cache_name()
752760
meta_snapshot = _load_json_file(proto_meta, manager,
753761
log_sucess='Proto meta ',
754762
log_error='Could not load protocol metadata: ')
@@ -781,8 +789,7 @@ def _load_json_file(file: str, manager: BuildManager,
781789
log_sucess: str, log_error: str) -> Optional[Dict[str, Any]]:
782790
"""A simple helper to read a JSON file with logging."""
783791
try:
784-
with open(file, 'r') as f:
785-
data = f.read()
792+
data = manager.metastore.read(file)
786793
except IOError:
787794
manager.log(log_error + file)
788795
return None
@@ -791,14 +798,12 @@ def _load_json_file(file: str, manager: BuildManager,
791798
return result
792799

793800

794-
def _cache_dir_prefix(manager: BuildManager, id: Optional[str] = None) -> str:
801+
def _cache_dir_prefix(manager: BuildManager) -> str:
795802
"""Get current cache directory (or file if id is given)."""
796803
cache_dir = manager.options.cache_dir
797804
pyversion = manager.options.python_version
798805
base = os.path.join(cache_dir, '%d.%d' % pyversion)
799-
if id is None:
800-
return base
801-
return os.path.join(base, *id.split('.'))
806+
return base
802807

803808

804809
def get_cache_names(id: str, path: str, manager: BuildManager) -> Tuple[str, str, Optional[str]]:
@@ -816,8 +821,14 @@ def get_cache_names(id: str, path: str, manager: BuildManager) -> Tuple[str, str
816821
"""
817822
pair = manager.options.cache_map.get(path)
818823
if pair is not None:
819-
return (pair[0], pair[1], None)
820-
prefix = _cache_dir_prefix(manager, id)
824+
# The cache map paths were specified relative to the base directory,
825+
# but the filesystem metastore APIs operates relative to the cache
826+
# prefix directory.
827+
# Solve this by rewriting the paths as relative to the root dir.
828+
# This only makes sense when using the filesystem backed cache.
829+
root = _cache_dir_prefix(manager)
830+
return (os.path.relpath(pair[0], root), os.path.relpath(pair[1], root), None)
831+
prefix = os.path.join(*id.split('.'))
821832
is_package = os.path.basename(path).startswith('__init__.py')
822833
if is_package:
823834
prefix = os.path.join(prefix, '__init__')
@@ -828,7 +839,7 @@ def get_cache_names(id: str, path: str, manager: BuildManager) -> Tuple[str, str
828839
return (prefix + '.meta.json', prefix + '.data.json', deps_json)
829840

830841

831-
def get_protocol_deps_cache_name(manager: BuildManager) -> Tuple[str, str]:
842+
def get_protocol_deps_cache_name() -> Tuple[str, str]:
832843
"""Return file names for fine grained protocol dependencies cache.
833844
834845
Since these dependencies represent a global state of the program, they
@@ -838,7 +849,7 @@ def get_protocol_deps_cache_name(manager: BuildManager) -> Tuple[str, str]:
838849
contains hashes of all source files at the time the protocol dependencies
839850
were written, and data file contains the protocol dependencies.
840851
"""
841-
name = os.path.join(_cache_dir_prefix(manager), '@proto_deps')
852+
name = '@proto_deps'
842853
return name + '.meta.json', name + '.data.json'
843854

844855

@@ -914,23 +925,6 @@ def find_cache_meta(id: str, path: str, manager: BuildManager) -> Optional[Cache
914925
return m
915926

916927

917-
def random_string() -> str:
918-
return binascii.hexlify(os.urandom(8)).decode('ascii')
919-
920-
921-
def atomic_write(filename: str, line1: str, line2: str) -> bool:
922-
lines = [line1, line2]
923-
tmp_filename = filename + '.' + random_string()
924-
try:
925-
with open(tmp_filename, 'w') as f:
926-
for line in lines:
927-
f.write(line)
928-
os.replace(tmp_filename, filename)
929-
except os.error:
930-
return False
931-
return True
932-
933-
934928
def validate_meta(meta: Optional[CacheMeta], id: str, path: Optional[str],
935929
ignore_all: bool, manager: BuildManager) -> Optional[CacheMeta]:
936930
'''Checks whether the cached AST of this module can be used.
@@ -1044,7 +1038,7 @@ def validate_meta(meta: Optional[CacheMeta], id: str, path: Optional[str],
10441038
meta_json, _, _ = get_cache_names(id, path, manager)
10451039
manager.log('Updating mtime for {}: file {}, meta {}, mtime {}'
10461040
.format(id, path, meta_json, meta.mtime))
1047-
atomic_write(meta_json, meta_str, '\n') # Ignore errors, it's just an optimization.
1041+
manager.metastore.write(meta_json, meta_str) # Ignore errors, just an optimization.
10481042
return meta
10491043

10501044
# It's a match on (id, path, size, hash, mtime).
@@ -1097,6 +1091,7 @@ def write_cache(id: str, path: str, tree: MypyFile,
10971091
corresponding to the metadata that was written (the latter may
10981092
be None if the cache could not be written).
10991093
"""
1094+
metastore = manager.metastore
11001095
# For Bazel we use relative paths and zero mtimes.
11011096
bazel = manager.options.bazel
11021097

@@ -1111,19 +1106,13 @@ def write_cache(id: str, path: str, tree: MypyFile,
11111106
if bazel:
11121107
tree.path = path
11131108

1114-
# Make sure directory for cache files exists
1115-
parent = os.path.dirname(data_json)
1116-
assert os.path.dirname(meta_json) == parent
1117-
11181109
# Serialize data and analyze interface
11191110
data = tree.serialize()
11201111
data_str = json_dumps(data, manager.options.debug_cache)
11211112
interface_hash = compute_hash(data_str)
11221113

11231114
# Obtain and set up metadata
11241115
try:
1125-
if parent:
1126-
os.makedirs(parent, exist_ok=True)
11271116
st = manager.get_stat(path)
11281117
except OSError as err:
11291118
manager.log("Cannot get stat for {}: {}".format(path, err))
@@ -1146,7 +1135,7 @@ def write_cache(id: str, path: str, tree: MypyFile,
11461135
manager.trace("Interface for {} is unchanged".format(id))
11471136
else:
11481137
manager.trace("Interface for {} has changed".format(id))
1149-
if not atomic_write(data_json, data_str, '\n'):
1138+
if not metastore.write(data_json, data_str):
11501139
# Most likely the error is the replace() call
11511140
# (see https://github.com/python/mypy/issues/3215).
11521141
manager.log("Error writing data JSON file {}".format(data_json))
@@ -1164,7 +1153,7 @@ def write_cache(id: str, path: str, tree: MypyFile,
11641153
deps_mtime = None
11651154
if deps_json:
11661155
deps_str = json_dumps(serialized_fine_grained_deps, manager.options.debug_cache)
1167-
if not atomic_write(deps_json, deps_str, '\n'):
1156+
if not metastore.write(deps_json, deps_str):
11681157
manager.log("Error writing deps JSON file {}".format(deps_json))
11691158
return interface_hash, None
11701159
deps_mtime = manager.getmtime(deps_json)
@@ -1193,7 +1182,7 @@ def write_cache(id: str, path: str, tree: MypyFile,
11931182

11941183
# Write meta cache file
11951184
meta_str = json_dumps(meta, manager.options.debug_cache)
1196-
if not atomic_write(meta_json, meta_str, '\n'):
1185+
if not metastore.write(meta_json, meta_str):
11971186
# Most likely the error is the replace() call
11981187
# (see https://github.com/python/mypy/issues/3215).
11991188
# The next run will simply find the cache entry out of date.
@@ -1216,7 +1205,7 @@ def delete_cache(id: str, path: str, manager: BuildManager) -> None:
12161205
for filename in cache_paths:
12171206
try:
12181207
if filename:
1219-
os.remove(filename)
1208+
manager.metastore.remove(filename)
12201209
except OSError as e:
12211210
if e.errno != errno.ENOENT:
12221211
manager.log("Error deleting cache file {}: {}".format(filename, e.strerror))
@@ -1590,16 +1579,14 @@ def load_fine_grained_deps(self) -> None:
15901579
assert self.meta is not None, "Internal error: this method must be called only" \
15911580
" for cached modules"
15921581
assert self.meta.deps_json
1593-
with open(self.meta.deps_json) as f:
1594-
deps = json.load(f)
1582+
deps = json.loads(self.manager.metastore.read(self.meta.deps_json))
15951583
# TODO: Assert deps file wasn't changed.
15961584
self.fine_grained_deps = {k: set(v) for k, v in deps.items()}
15971585

15981586
def load_tree(self, temporary: bool = False) -> None:
15991587
assert self.meta is not None, "Internal error: this method must be called only" \
16001588
" for cached modules"
1601-
with open(self.meta.data_json) as f:
1602-
data = json.load(f)
1589+
data = json.loads(self.manager.metastore.read(self.meta.data_json))
16031590
# TODO: Assert data file wasn't changed.
16041591
self.tree = MypyFile.deserialize(data)
16051592
if not temporary:

mypy/main.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,9 @@ def add_invertible_flag(flag: str,
570570
'--cache-dir', action='store', metavar='DIR',
571571
help="Store module cache info in the given folder in incremental mode "
572572
"(defaults to '{}')".format(defaults.CACHE_DIR))
573+
add_invertible_flag('--sqlite-cache', default=False,
574+
help="Use a sqlite database to store the cache",
575+
group=incremental_group)
573576
incremental_group.add_argument(
574577
'--cache-fine-grained', action='store_true',
575578
help="Include fine-grained dependency information in the cache for the mypy daemon")
@@ -793,6 +796,9 @@ def add_invertible_flag(flag: str,
793796

794797
# Process --cache-map.
795798
if special_opts.cache_map:
799+
if options.sqlite_cache:
800+
parser.error("--cache-map is incompatible with --sqlite-cache")
801+
796802
process_cache_map(parser, special_opts, options)
797803

798804
# Let quick_and_dirty imply incremental.

0 commit comments

Comments (0)