#!/usr/bin/env python3 """Produce a diff between mypy caches. With some infrastructure, this can allow for distributing small cache diffs to users in many cases instead of full cache artifacts. """ from __future__ import annotations import argparse import json import os import sys from collections import defaultdict from typing import Any sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore def make_cache(input_dir: str, sqlite: bool) -> MetadataStore: if sqlite: return SqliteMetadataStore(input_dir) else: return FilesystemMetadataStore(input_dir) def merge_deps(all: dict[str, set[str]], new: dict[str, set[str]]) -> None: for k, v in new.items(): all.setdefault(k, set()).update(v) def load(cache: MetadataStore, s: str) -> Any: data = cache.read(s) obj = json.loads(data) if s.endswith(".meta.json"): # For meta files, zero out the mtimes and sort the # dependencies to avoid spurious conflicts obj["mtime"] = 0 obj["data_mtime"] = 0 if "dependencies" in obj: all_deps = obj["dependencies"] + obj["suppressed"] num_deps = len(obj["dependencies"]) thing = list(zip(all_deps, obj["dep_prios"], obj["dep_lines"])) def unzip(x: Any) -> Any: return zip(*x) if x else ((), (), ()) obj["dependencies"], prios1, lines1 = unzip(sorted(thing[:num_deps])) obj["suppressed"], prios2, lines2 = unzip(sorted(thing[num_deps:])) obj["dep_prios"] = prios1 + prios2 obj["dep_lines"] = lines1 + lines2 if s.endswith(".deps.json"): # For deps files, sort the deps to avoid spurious mismatches for v in obj.values(): v.sort() return obj def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--verbose", action="store_true", default=False, help="Increase verbosity") parser.add_argument("--sqlite", action="store_true", default=False, help="Use a sqlite cache") parser.add_argument("input_dir1", help="Input directory for the cache") parser.add_argument("input_dir2", help="Input directory for the cache") parser.add_argument("output", help="Output file") args = parser.parse_args() cache1 = make_cache(args.input_dir1, args.sqlite) cache2 = make_cache(args.input_dir2, args.sqlite) type_misses: dict[str, int] = defaultdict(int) type_hits: dict[str, int] = defaultdict(int) updates: dict[str, str | None] = {} deps1: dict[str, set[str]] = {} deps2: dict[str, set[str]] = {} misses = hits = 0 cache1_all = list(cache1.list_all()) for s in cache1_all: obj1 = load(cache1, s) try: obj2 = load(cache2, s) except FileNotFoundError: obj2 = None typ = s.split(".")[-2] if obj1 != obj2: misses += 1 type_misses[typ] += 1 # Collect the dependencies instead of including them directly in the diff # so we can produce a much smaller direct diff of them. if ".deps." not in s: if obj2 is not None: updates[s] = json.dumps(obj2) else: updates[s] = None elif obj2: merge_deps(deps1, obj1) merge_deps(deps2, obj2) else: hits += 1 type_hits[typ] += 1 cache1_all_set = set(cache1_all) for s in cache2.list_all(): if s not in cache1_all_set: updates[s] = cache2.read(s) # Compute what deps have been added and merge them all into the # @root deps file. new_deps = {k: deps1.get(k, set()) - deps2.get(k, set()) for k in deps2} new_deps = {k: v for k, v in new_deps.items() if v} try: root_deps = load(cache1, "@root.deps.json") except FileNotFoundError: root_deps = {} merge_deps(new_deps, root_deps) new_deps_json = {k: list(v) for k, v in new_deps.items() if v} updates["@root.deps.json"] = json.dumps(new_deps_json) # Drop updates to deps.meta.json for size reasons. The diff # applier will manually fix it up. updates.pop("./@deps.meta.json", None) updates.pop("@deps.meta.json", None) ### print("Generated incremental cache:", hits, "hits,", misses, "misses") if args.verbose: print("hits", type_hits) print("misses", type_misses) with open(args.output, "w") as f: json.dump(updates, f) if __name__ == "__main__": main()