Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 9b94724

Browse files
authored
Support fixed-format cache in cache diff scripts (#20827)
Also fix some older bytes/str issues. I tested this manually, and used a coding agent to write throwaway unit tests to ensure the refactoring didn't alter behavior. I may add some tests later on, but these are a bit tricky to test without hacks.
1 parent 70de10e commit 9b94724

2 files changed

Lines changed: 125 additions & 24 deletions

File tree

misc/apply-cache-diff.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
44
With some infrastructure, this can allow for distributing small cache diffs to users in
55
many cases instead of full cache artifacts.
6+
7+
Use diff-cache.py to generate a cache diff.
68
"""
79

810
from __future__ import annotations
@@ -13,6 +15,10 @@
1315

1416
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
1517

18+
from librt import base64
19+
from librt.internal import ReadBuffer
20+
21+
from mypy.cache import CacheMeta
1622
from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
1723
from mypy.util import json_dumps, json_loads
1824

@@ -35,10 +41,19 @@ def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None:
3541
if data is None:
3642
cache.remove(file)
3743
else:
38-
cache.write(file, data)
39-
if file.endswith(".meta.json") and "@deps" not in file:
40-
meta = json_loads(data)
41-
old_deps["snapshot"][meta["id"]] = meta["hash"]
44+
if file.endswith(".ff"):
45+
data_bytes = base64.b64decode(data)
46+
else:
47+
data_bytes = data.encode() if isinstance(data, str) else data
48+
cache.write(file, data_bytes)
49+
if file.endswith(".meta.ff") and "@deps" not in file:
50+
buf = ReadBuffer(data_bytes[2:])
51+
meta = CacheMeta.read(buf, data_file="")
52+
assert meta is not None
53+
old_deps["snapshot"][meta.id] = meta.hash
54+
elif file.endswith(".meta.json") and "@deps" not in file:
55+
meta_dict = json_loads(data_bytes)
56+
old_deps["snapshot"][meta_dict["id"]] = meta_dict["hash"]
4257

4358
cache.write("@deps.meta.json", json_dumps(old_deps))
4459

misc/diff-cache.py

Lines changed: 106 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515

1616
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
1717

18+
from librt import base64
19+
from librt.internal import ReadBuffer, WriteBuffer
20+
21+
from mypy.cache import CacheMeta
1822
from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
1923
from mypy.util import json_dumps, json_loads
2024

@@ -31,33 +35,109 @@ def merge_deps(all: dict[str, set[str]], new: dict[str, set[str]]) -> None:
3135
all.setdefault(k, set()).update(v)
3236

3337

38+
def sort_deps(
    dependencies: list[str], suppressed: list[str], dep_prios: list[int], dep_lines: list[int]
) -> tuple[list[str], list[str], list[int], list[int]]:
    """Sort dependencies and suppressed independently, keeping prios/lines aligned.

    Each dependency name stays paired with its priority and line number while
    the two name groups (regular and suppressed) are sorted separately.
    """

    def split_sorted(
        triples: list[tuple[str, int, int]]
    ) -> tuple[list[str], list[int], list[int]]:
        # Sort by name (prio/line act as tie-breakers, matching plain tuple
        # ordering) and transpose back into three parallel lists.
        names: list[str] = []
        prios: list[int] = []
        lines: list[int] = []
        for name, prio, line in sorted(triples):
            names.append(name)
            prios.append(prio)
            lines.append(line)
        return names, prios, lines

    # dep_prios/dep_lines are parallel to dependencies + suppressed concatenated.
    paired = list(zip(dependencies + suppressed, dep_prios, dep_lines))
    boundary = len(dependencies)
    deps_sorted, deps_prios, deps_lines = split_sorted(paired[:boundary])
    supp_sorted, supp_prios, supp_lines = split_sorted(paired[boundary:])
    return deps_sorted, supp_sorted, deps_prios + supp_prios, deps_lines + supp_lines
65+
66+
67+
def normalize_meta(meta: CacheMeta) -> None:
    """Normalize a CacheMeta instance in place to avoid spurious diffs.

    Mtimes are zeroed and dependency lists are sorted deterministically.
    """
    meta.mtime = meta.data_mtime = 0
    sorted_fields = sort_deps(
        meta.dependencies, meta.suppressed, meta.dep_prios, meta.dep_lines
    )
    meta.dependencies, meta.suppressed, meta.dep_prios, meta.dep_lines = sorted_fields
77+
78+
79+
def serialize_meta_ff(meta: CacheMeta, version_prefix: bytes) -> bytes:
    """Serialize a CacheMeta instance back to fixed format binary.

    The version prefix that was stripped off before deserializing is
    re-attached so the result matches the on-disk .meta.ff layout.
    """
    out = WriteBuffer()
    meta.write(out)
    return version_prefix + out.getvalue()
84+
85+
86+
def normalize_json_meta(obj: dict[str, Any]) -> None:
    """Normalize a JSON meta dict in place to avoid spurious diffs.

    Mtimes are zeroed and dependency lists are sorted deterministically.
    """
    obj["mtime"] = 0
    obj["data_mtime"] = 0
    if "dependencies" not in obj:
        return
    sorted_fields = sort_deps(
        obj["dependencies"], obj["suppressed"], obj["dep_prios"], obj["dep_lines"]
    )
    obj["dependencies"], obj["suppressed"], obj["dep_prios"], obj["dep_lines"] = sorted_fields
97+
98+
3499
def load(cache: MetadataStore, s: str) -> Any:
    """Load and normalize a cache entry.

    Returns:
    - For .meta.ff: normalized binary bytes (with version prefix)
    - For .data.ff: raw binary bytes
    - For .meta.json/.data.json/.deps.json: parsed and normalized dict/list
    """
    raw = cache.read(s)
    if s.endswith(".data.ff"):
        # Fixed-format data files are opaque binary; diff them byte-for-byte.
        return raw
    if s.endswith(".meta.ff"):
        prefix, payload = raw[:2], raw[2:]
        meta = CacheMeta.read(ReadBuffer(payload), data_file="")
        if meta is None:
            # Can't deserialize (e.g. different mypy version). Fall back to
            # raw bytes -- we lose mtime normalization but the diff stays correct.
            return raw
        normalize_meta(meta)
        return serialize_meta_ff(meta, prefix)
    parsed = json_loads(raw)
    if s.endswith(".meta.json"):
        normalize_json_meta(parsed)
    elif s.endswith(".deps.json"):
        # For deps files, sort the deps to avoid spurious mismatches.
        for dep_list in parsed.values():
            dep_list.sort()
    return parsed
59128

60129

130+
def encode_for_diff(s: str, obj: object) -> str:
    """Encode a cache entry value for inclusion in the JSON diff.

    Fixed format binary entries (bytes) are base64-encoded; JSON entries are
    re-serialized as JSON strings.
    """
    if not isinstance(obj, bytes):
        return json_dumps(obj).decode()
    return base64.b64encode(obj).decode()
139+
140+
61141
def main() -> None:
62142
parser = argparse.ArgumentParser()
63143
parser.add_argument("--verbose", action="store_true", default=False, help="Increase verbosity")
@@ -73,7 +153,7 @@ def main() -> None:
73153
type_misses: dict[str, int] = defaultdict(int)
74154
type_hits: dict[str, int] = defaultdict(int)
75155

76-
updates: dict[str, bytes | None] = {}
156+
updates: dict[str, str | None] = {}
77157

78158
deps1: dict[str, set[str]] = {}
79159
deps2: dict[str, set[str]] = {}
@@ -96,10 +176,12 @@ def main() -> None:
96176
# so we can produce a much smaller direct diff of them.
97177
if ".deps." not in s:
98178
if obj2 is not None:
99-
updates[s] = json_dumps(obj2)
179+
updates[s] = encode_for_diff(s, obj2)
100180
else:
101181
updates[s] = None
102182
elif obj2:
183+
# This is a deps file, with json data
184+
assert ".deps." in s
103185
merge_deps(deps1, obj1)
104186
merge_deps(deps2, obj2)
105187
else:
@@ -109,7 +191,11 @@ def main() -> None:
109191
cache1_all_set = set(cache1_all)
110192
for s in cache2.list_all():
111193
if s not in cache1_all_set:
112-
updates[s] = cache2.read(s)
194+
raw = cache2.read(s)
195+
if s.endswith(".ff"):
196+
updates[s] = base64.b64encode(raw).decode()
197+
else:
198+
updates[s] = raw.decode()
113199

114200
# Compute what deps have been added and merge them all into the
115201
# @root deps file.
@@ -122,7 +208,7 @@ def main() -> None:
122208
merge_deps(new_deps, root_deps)
123209

124210
new_deps_json = {k: list(v) for k, v in new_deps.items() if v}
125-
updates["@root.deps.json"] = json_dumps(new_deps_json)
211+
updates["@root.deps.json"] = json_dumps(new_deps_json).decode()
126212

127213
# Drop updates to deps.meta.json for size reasons. The diff
128214
# applier will manually fix it up.

0 commit comments

Comments
 (0)