Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 9b94724

Browse files
authored
Support fixed-format cache in cache diff scripts (#20827)
Also fix some older bytes/str issues. I tested this manually, and used a coding agent to write throwaway unit tests to ensure the refactoring didn't alter behavior. I may add some tests later on, but these are a bit tricky to test without hacks.
1 parent 70de10e commit 9b94724

2 files changed

Lines changed: 125 additions & 24 deletions

File tree

misc/apply-cache-diff.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
44
With some infrastructure, this can allow for distributing small cache diffs to users in
55
many cases instead of full cache artifacts.
6+
7+
Use diff-cache.py to generate a cache diff.
68
"""
79

810
from __future__ import annotations
@@ -13,6 +15,10 @@
1315

1416
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
1517

18+
from librt import base64
19+
from librt.internal import ReadBuffer
20+
21+
from mypy.cache import CacheMeta
1622
from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
1723
from mypy.util import json_dumps, json_loads
1824

@@ -35,10 +41,19 @@ def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None:
3541
if data is None:
3642
cache.remove(file)
3743
else:
38-
cache.write(file, data)
39-
if file.endswith(".meta.json") and "@deps" not in file:
40-
meta = json_loads(data)
41-
old_deps["snapshot"][meta["id"]] = meta["hash"]
44+
if file.endswith(".ff"):
45+
data_bytes = base64.b64decode(data)
46+
else:
47+
data_bytes = data.encode() if isinstance(data, str) else data
48+
cache.write(file, data_bytes)
49+
if file.endswith(".meta.ff") and "@deps" not in file:
50+
buf = ReadBuffer(data_bytes[2:])
51+
meta = CacheMeta.read(buf, data_file="")
52+
assert meta is not None
53+
old_deps["snapshot"][meta.id] = meta.hash
54+
elif file.endswith(".meta.json") and "@deps" not in file:
55+
meta_dict = json_loads(data_bytes)
56+
old_deps["snapshot"][meta_dict["id"]] = meta_dict["hash"]
4257

4358
cache.write("@deps.meta.json", json_dumps(old_deps))
4459

misc/diff-cache.py

Lines changed: 106 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515

1616
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
1717

18+
from librt import base64
19+
from librt.internal import ReadBuffer, WriteBuffer
20+
21+
from mypy.cache import CacheMeta
1822
from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
1923
from mypy.util import json_dumps, json_loads
2024

@@ -31,33 +35,109 @@ def merge_deps(all: dict[str, set[str]], new: dict[str, set[str]]) -> None:
3135
all.setdefault(k, set()).update(v)
3236

3337

38+
def sort_deps(
    dependencies: list[str], suppressed: list[str], dep_prios: list[int], dep_lines: list[int]
) -> tuple[list[str], list[str], list[int], list[int]]:
    """Sort dependencies and suppressed independently, keeping prios/lines aligned.

    Each dependency name stays paired with its priority and line number while
    the two name groups (regular and suppressed) are sorted separately.
    """

    def split_sorted(
        triples: list[tuple[str, int, int]]
    ) -> tuple[list[str], list[int], list[int]]:
        # Sort by name (prio/line act as tie-breakers, matching plain tuple
        # ordering) and transpose back into three parallel lists.
        names: list[str] = []
        prios: list[int] = []
        lines: list[int] = []
        for name, prio, line in sorted(triples):
            names.append(name)
            prios.append(prio)
            lines.append(line)
        return names, prios, lines

    # dep_prios/dep_lines are parallel to dependencies + suppressed concatenated.
    paired = list(zip(dependencies + suppressed, dep_prios, dep_lines))
    boundary = len(dependencies)
    deps_sorted, deps_prios, deps_lines = split_sorted(paired[:boundary])
    supp_sorted, supp_prios, supp_lines = split_sorted(paired[boundary:])
    return deps_sorted, supp_sorted, deps_prios + supp_prios, deps_lines + supp_lines
65+
66+
67+
def normalize_meta(meta: CacheMeta) -> None:
    """Normalize a CacheMeta instance in place to avoid spurious diffs.

    Mtimes are zeroed and dependency lists are sorted deterministically.
    """
    meta.mtime = meta.data_mtime = 0
    sorted_fields = sort_deps(
        meta.dependencies, meta.suppressed, meta.dep_prios, meta.dep_lines
    )
    meta.dependencies, meta.suppressed, meta.dep_prios, meta.dep_lines = sorted_fields
77+
78+
79+
def serialize_meta_ff(meta: CacheMeta, version_prefix: bytes) -> bytes:
    """Serialize a CacheMeta instance back to fixed format binary.

    The version prefix that was stripped off before deserializing is
    re-attached so the result matches the on-disk .meta.ff layout.
    """
    out = WriteBuffer()
    meta.write(out)
    return version_prefix + out.getvalue()
84+
85+
86+
def normalize_json_meta(obj: dict[str, Any]) -> None:
    """Normalize a JSON meta dict in place to avoid spurious diffs.

    Mtimes are zeroed and dependency lists are sorted deterministically.
    """
    obj["mtime"] = 0
    obj["data_mtime"] = 0
    if "dependencies" not in obj:
        return
    sorted_fields = sort_deps(
        obj["dependencies"], obj["suppressed"], obj["dep_prios"], obj["dep_lines"]
    )
    obj["dependencies"], obj["suppressed"], obj["dep_prios"], obj["dep_lines"] = sorted_fields
97+
98+
3499
def load(cache: MetadataStore, s: str) -> Any:
    """Load and normalize a cache entry.

    Returns:
    - For .meta.ff: normalized binary bytes (with version prefix)
    - For .data.ff: raw binary bytes
    - For .meta.json/.data.json/.deps.json: parsed and normalized dict/list
    """
    raw = cache.read(s)
    if s.endswith(".data.ff"):
        # Fixed-format data files are opaque binary; diff them byte-for-byte.
        return raw
    if s.endswith(".meta.ff"):
        prefix, payload = raw[:2], raw[2:]
        meta = CacheMeta.read(ReadBuffer(payload), data_file="")
        if meta is None:
            # Can't deserialize (e.g. different mypy version). Fall back to
            # raw bytes -- we lose mtime normalization but the diff stays correct.
            return raw
        normalize_meta(meta)
        return serialize_meta_ff(meta, prefix)
    parsed = json_loads(raw)
    if s.endswith(".meta.json"):
        normalize_json_meta(parsed)
    elif s.endswith(".deps.json"):
        # For deps files, sort the deps to avoid spurious mismatches.
        for dep_list in parsed.values():
            dep_list.sort()
    return parsed
59128

60129

130+
def encode_for_diff(s: str, obj: object) -> str:
    """Encode a cache entry value for inclusion in the JSON diff.

    Fixed format binary entries (bytes) are base64-encoded; JSON entries are
    re-serialized as JSON strings.
    """
    if not isinstance(obj, bytes):
        return json_dumps(obj).decode()
    return base64.b64encode(obj).decode()
139+
140+
61141
def main() -> None:
62142
parser = argparse.ArgumentParser()
63143
parser.add_argument("--verbose", action="store_true", default=False, help="Increase verbosity")
@@ -73,7 +153,7 @@ def main() -> None:
73153
type_misses: dict[str, int] = defaultdict(int)
74154
type_hits: dict[str, int] = defaultdict(int)
75155

76-
updates: dict[str, bytes | None] = {}
156+
updates: dict[str, str | None] = {}
77157

78158
deps1: dict[str, set[str]] = {}
79159
deps2: dict[str, set[str]] = {}
@@ -96,10 +176,12 @@ def main() -> None:
96176
# so we can produce a much smaller direct diff of them.
97177
if ".deps." not in s:
98178
if obj2 is not None:
99-
updates[s] = json_dumps(obj2)
179+
updates[s] = encode_for_diff(s, obj2)
100180
else:
101181
updates[s] = None
102182
elif obj2:
183+
# This is a deps file, with json data
184+
assert ".deps." in s
103185
merge_deps(deps1, obj1)
104186
merge_deps(deps2, obj2)
105187
else:
@@ -109,7 +191,11 @@ def main() -> None:
109191
cache1_all_set = set(cache1_all)
110192
for s in cache2.list_all():
111193
if s not in cache1_all_set:
112-
updates[s] = cache2.read(s)
194+
raw = cache2.read(s)
195+
if s.endswith(".ff"):
196+
updates[s] = base64.b64encode(raw).decode()
197+
else:
198+
updates[s] = raw.decode()
113199

114200
# Compute what deps have been added and merge them all into the
115201
# @root deps file.
@@ -122,7 +208,7 @@ def main() -> None:
122208
merge_deps(new_deps, root_deps)
123209

124210
new_deps_json = {k: list(v) for k, v in new_deps.items() if v}
125-
updates["@root.deps.json"] = json_dumps(new_deps_json)
211+
updates["@root.deps.json"] = json_dumps(new_deps_json).decode()
126212

127213
# Drop updates to deps.meta.json for size reasons. The diff
128214
# applier will manually fix it up.

0 commit comments

Comments
 (0)