forked from safishamsi/graphify
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_cache.py
More file actions
422 lines (334 loc) · 16.4 KB
/
Copy pathtest_cache.py
File metadata and controls
422 lines (334 loc) · 16.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
"""Tests for graphify/cache.py."""
import pytest
from pathlib import Path
from graphify.cache import file_hash, cache_dir, load_cached, save_cached, cached_files, clear_cache, _body_content
@pytest.fixture
def tmp_file(tmp_path):
    """Create ``sample.txt`` in the per-test temp directory and return its path."""
    sample = tmp_path / "sample.txt"
    sample.write_text("hello world")
    return sample
@pytest.fixture
def cache_root(tmp_path):
    """Return the per-test temp directory, used by the tests as the cache root."""
    return tmp_path
def test_file_hash_consistent(tmp_file):
    """Hashing the same unchanged file twice yields the same string."""
    first, second = file_hash(tmp_file), file_hash(tmp_file)
    assert first == second
    assert isinstance(first, str)
    # A SHA256 hex digest is 64 characters long.
    assert len(first) == 64
def test_file_hash_changes(tmp_path):
    """Two files with different contents must not share a hash."""
    first = tmp_path / "a.txt"
    first.write_text("content one")
    second = tmp_path / "b.txt"
    second.write_text("content two")
    assert file_hash(first) != file_hash(second)
def test_cache_roundtrip(tmp_file, cache_root):
    """A result dict that was saved is returned equal by a subsequent load."""
    payload = {"nodes": [{"id": "n1", "label": "Node1"}], "edges": []}
    save_cached(tmp_file, payload, root=cache_root)
    assert load_cached(tmp_file, root=cache_root) == payload
def test_cache_miss_on_change(tmp_file, cache_root):
    """Once the file's content is rewritten, load_cached returns None."""
    save_cached(
        tmp_file,
        {"nodes": [], "edges": [{"source": "a", "target": "b"}]},
        root=cache_root,
    )
    # Overwrite the source file after the entry has been saved.
    tmp_file.write_text("completely different content")
    missed = load_cached(tmp_file, root=cache_root)
    assert missed is None
def test_cached_files(tmp_path, cache_root):
    """cached_files includes the hash of every file that was saved."""
    sources = []
    for name, text in (("file1.py", "alpha"), ("file2.py", "beta")):
        path = tmp_path / name
        path.write_text(text)
        sources.append(path)
    # Both files exist on disk before either one is cached.
    for path in sources:
        save_cached(path, {"nodes": [], "edges": []}, root=cache_root)
    hashes = cached_files(cache_root)
    for path in sources:
        assert file_hash(path, cache_root) in hashes
def test_clear_cache(tmp_file, cache_root):
    """clear_cache removes all .json files from graphify-out/cache/ (all subdirs)."""
    save_cached(tmp_file, {"nodes": [], "edges": []}, root=cache_root)
    # Since v0.5.3 entries go into cache/ast/, not the flat cache/ dir, so the
    # search is recursive rather than limited to the top level.
    cache_base = cache_root / "graphify-out" / "cache"
    # any() stops at the first match instead of building a list just to
    # measure its length; Path objects are always truthy.
    assert any(cache_base.rglob("*.json"))
    clear_cache(cache_root)
    assert not any(cache_base.rglob("*.json"))
def test_md_frontmatter_only_change_same_hash(tmp_path):
    """Editing only a frontmatter value of a .md file leaves its hash unchanged."""
    doc = tmp_path / "doc.md"
    doc.write_text("---\nreviewed: 2026-01-01\n---\n\n# Title\n\nBody text.")
    before = file_hash(doc)
    # Same body, different ``reviewed`` date in the frontmatter.
    doc.write_text("---\nreviewed: 2026-04-09\n---\n\n# Title\n\nBody text.")
    after = file_hash(doc)
    assert before == after
def test_md_body_change_different_hash(tmp_path):
    """Editing the body of a .md file (frontmatter untouched) changes its hash."""
    doc = tmp_path / "doc.md"
    doc.write_text("---\nreviewed: 2026-01-01\n---\n\n# Title\n\nOriginal body.")
    before = file_hash(doc)
    doc.write_text("---\nreviewed: 2026-01-01\n---\n\n# Title\n\nChanged body.")
    after = file_hash(doc)
    assert before != after
def test_md_no_frontmatter_hashed_normally(tmp_path):
    """For a .md file without frontmatter, a content edit changes the hash."""
    doc = tmp_path / "doc.md"
    doc.write_text("# Just a heading\n\nNo frontmatter here.")
    before = file_hash(doc)
    doc.write_text("# Just a heading\n\nDifferent content.")
    after = file_hash(doc)
    assert before != after
def test_non_md_file_hashed_fully(tmp_path):
    """For a non-.md file, even a comment-only edit changes the hash."""
    script = tmp_path / "script.py"
    script.write_text("# comment\nx = 1")
    before = file_hash(script)
    script.write_text("# changed comment\nx = 1")
    after = file_hash(script)
    assert before != after
def test_body_content_strips_frontmatter():
    """_body_content drops a leading YAML frontmatter block."""
    stripped = _body_content(b"---\ntitle: Test\n---\n\nActual body.")
    assert stripped == b"\n\nActual body."
def test_body_content_no_frontmatter():
    """Input with no frontmatter comes back from _body_content unchanged."""
    raw = b"No frontmatter here."
    assert _body_content(raw) == raw
# --- #1259: frontmatter delimiters must be whole `---` lines -----------------
def test_body_content_hr_start_is_not_frontmatter():
    """An opening ``----`` thematic break is not a frontmatter delimiter.

    The later ``---`` hr must therefore not be treated as a close delimiter,
    and the content passes through unchanged.
    """
    raw = b"----\nIntro paragraph that must be hashed.\n\n---\nbody"
    assert _body_content(raw) == raw
def test_body_content_dash_title_start_is_not_frontmatter():
    """A first line of ``--- title`` is prose and does not open frontmatter."""
    raw = b"--- title\nIntro that must be hashed.\n\n---\nbody"
    assert _body_content(raw) == raw
def test_body_content_dash_text_line_is_not_close_delimiter():
    """``--- text`` and ``----`` lines do not close an opened frontmatter block.

    With no proper close delimiter present, the content is returned unchanged.
    """
    raw = b"---\ntitle: Test\nbody starts here\n--- not a delimiter\n----\nreal content"
    assert _body_content(raw) == raw
def test_body_content_later_proper_close_skips_dash_text_lines():
    """Inline ``---`` text is skipped; the next whole ``---`` line is the close."""
    stripped = _body_content(b"---\ntitle: Test\nnote: --- inline\n---\nreal body")
    assert stripped == b"\nreal body"
def test_body_content_well_formed_output_byte_identical():
    """Well-formed frontmatter must strip to the historical bytes exactly.

    The expected values are what the historical substring implementation
    produced, so existing semantic-cache hashes do not churn (re-extraction
    is billed LLM work).
    """
    # Maps input -> output of the historical text.find("\n---")+4 algorithm.
    expected_by_input = {
        b"---\ntitle: Test\n---\n\nActual body.": b"\n\nActual body.",
        b"---\nreviewed: 2026-01-01\n---\n\n# Title\n\nBody text.": b"\n\n# Title\n\nBody text.",
        # close delimiter with trailing whitespace keeps it in the body
        b"---\ntitle: Test\n--- \nbody": b" \nbody",
        # CRLF line endings
        b"---\r\ntitle: Test\r\n---\r\nbody": b"\r\nbody",
        # empty frontmatter block
        b"---\n---\nbody": b"\nbody",
        # close as the very last line, no trailing newline
        b"---\ntitle: Test\n---": b"",
    }
    for content, expected in expected_by_input.items():
        # The input doubles as the failure message so the failing case is obvious.
        assert _body_content(content) == expected, content
def test_md_edit_above_hr_changes_hash(tmp_path):
    """An edit above a mid-document ``---`` break must change the hash.

    The document opens with ``----``; previously the region above the break
    was silently excluded from hashing.
    """
    doc = tmp_path / "doc.md"
    doc.write_text("----\nIntro paragraph.\n\n---\nbody")
    before = file_hash(doc)
    doc.write_text("----\nEdited intro paragraph.\n\n---\nbody")
    after = file_hash(doc)
    assert before != after
# --- #777: portable cache source_file fields --------------------------------
# ``save_cached`` relativizes ``source_file`` entries inside the cache file
# so a committed ``graphify-out/cache/`` is portable across machines and
# CI runners. ``load_cached`` re-absolutizes them so consumers (extract,
# merge into graph.json) see the same shape that fresh extraction emits.
def test_save_cached_relativizes_source_file(tmp_path):
    """On-disk cache JSON stores forward-slash relative ``source_file`` values.

    No absolute prefix from the saving machine may leak into the entry.
    """
    import json
    from graphify.cache import save_cached, file_hash, cache_dir

    src = tmp_path / "src" / "foo.py"
    src.parent.mkdir()
    src.write_text("def x(): pass\n")
    abs_src = str(src.resolve())

    save_cached(
        src,
        {
            "nodes": [{"id": "n1", "label": "foo", "source_file": abs_src}],
            "edges": [{"source": "n1", "target": "n1", "source_file": abs_src}],
        },
        root=tmp_path,
        kind="ast",
    )

    # Read the raw entry from disk rather than going through load_cached.
    h = file_hash(src, tmp_path)
    entry = cache_dir(tmp_path, "ast") / f"{h}.json"
    on_disk = json.loads(entry.read_text(encoding="utf-8"))

    node_sources = {node["source_file"] for node in on_disk["nodes"]}
    edge_sources = {edge["source_file"] for edge in on_disk["edges"]}
    assert node_sources == {"src/foo.py"}, (
        f"cache nodes must store relative source_file; got {node_sources}"
    )
    assert edge_sources == {"src/foo.py"}
def test_load_cached_absolutizes_source_file(tmp_path):
    """``load_cached`` hands back absolute ``source_file`` paths.

    That is the same shape a fresh extraction produces, so consumers don't
    need to special-case cache hits vs. fresh extraction.
    """
    from graphify.cache import save_cached, load_cached

    src = tmp_path / "src" / "foo.py"
    src.parent.mkdir()
    src.write_text("def x(): pass\n")
    abs_src = str(src.resolve())

    payload = {
        "nodes": [{"id": "n1", "source_file": abs_src}],
        "edges": [{"source": "n1", "target": "n1", "source_file": abs_src}],
    }
    save_cached(src, payload, root=tmp_path, kind="ast")

    loaded = load_cached(src, root=tmp_path, kind="ast")
    assert loaded is not None
    assert loaded["nodes"][0]["source_file"] == abs_src
    assert loaded["edges"][0]["source_file"] == abs_src
def test_load_cached_passes_through_legacy_absolute_source_file(tmp_path):
    """Entries holding an absolute ``source_file`` must still load correctly.

    Older graphify versions wrote absolute paths into the cache; for such
    values the absolutize step is a no-op.
    """
    import json
    from graphify.cache import load_cached, file_hash, cache_dir

    src = tmp_path / "src" / "foo.py"
    src.parent.mkdir()
    src.write_text("pass\n")
    abs_src = str(src.resolve())

    # Hand-write a legacy-format cache entry (absolute source_file).
    h = file_hash(src, tmp_path)
    entry = cache_dir(tmp_path, "ast") / f"{h}.json"
    legacy_payload = {
        "nodes": [{"id": "n1", "source_file": abs_src}],
        "edges": [],
    }
    entry.write_text(json.dumps(legacy_payload))

    loaded = load_cached(src, root=tmp_path, kind="ast")
    assert loaded is not None
    assert loaded["nodes"][0]["source_file"] == abs_src
def test_cache_portable_across_roots(tmp_path):
    """End-to-end portability: an entry written at one root loads at another.

    This works because the file is content-hashed AND its embedded
    ``source_file`` is stored relative.
    """
    import shutil
    from graphify.cache import save_cached, load_cached

    repo_a = tmp_path / "repo_a"
    repo_a.mkdir()
    (repo_a / "src").mkdir()
    src_a = repo_a / "src" / "foo.py"
    src_a.write_text("def x(): pass\n")
    save_cached(src_a, {
        "nodes": [{"id": "n1", "source_file": str(src_a.resolve())}],
        "edges": [],
    }, root=repo_a, kind="ast")

    # Copy corpus + cache to a second location with a different absolute prefix.
    repo_b = tmp_path / "repo_b"
    shutil.copytree(repo_a, repo_b)
    src_b = repo_b / "src" / "foo.py"

    loaded = load_cached(src_b, root=repo_b, kind="ast")
    assert loaded is not None, (
        "cache must port across absolute prefixes (content hash + relative source_file)"
    )
    # Source path re-anchored to the new root, not the old one.
    loaded_source = loaded["nodes"][0]["source_file"]
    assert loaded_source == str(src_b.resolve())
    assert str(repo_a) not in loaded_source
# --- AST cache versioning ----------------------------------------------------
# AST cache entries are the output of graphify's own extractor code, so they
# are only valid for the graphify version that wrote them. Keying purely on
# file content meant extractor fixes shipped in a new release kept serving
# stale pre-fix results. The AST cache is therefore namespaced by package
# version; the semantic cache is NOT (invalidating it would re-bill LLM
# extraction for unchanged files).
def test_ast_cache_invalidated_on_version_bump(tmp_path, monkeypatch):
    """An AST entry written by version X must not be served under version Y.

    The source file is unchanged across the bump, but the extractor is not.
    """
    import graphify.cache as cache_mod

    source = tmp_path / "mod.py"
    source.write_text("def f(): pass\n")

    # Save and read back while the module reports version 0.8.0.
    monkeypatch.setattr(cache_mod, "_EXTRACTOR_VERSION", "0.8.0", raising=False)
    save_cached(source, {"nodes": [{"id": "n1"}], "edges": []}, root=tmp_path, kind="ast")
    assert load_cached(source, root=tmp_path, kind="ast") is not None

    # Same file, patched to the next version: the entry must now be a miss.
    monkeypatch.setattr(cache_mod, "_EXTRACTOR_VERSION", "0.8.1", raising=False)
    after_bump = load_cached(source, root=tmp_path, kind="ast")
    assert after_bump is None, (
        "AST cache entry from a previous graphify version must not be served"
    )
def test_ast_cache_version_bump_cleans_stale_entries(tmp_path, monkeypatch):
    """Upgrading removes the AST directory left behind by the previous version.

    Otherwise the cache directory would grow one full copy per release.
    """
    import graphify.cache as cache_mod

    source = tmp_path / "mod.py"
    source.write_text("def f(): pass\n")

    monkeypatch.setattr(cache_mod, "_EXTRACTOR_VERSION", "0.8.0", raising=False)
    save_cached(source, {"nodes": [{"id": "n1"}], "edges": []}, root=tmp_path, kind="ast")
    old_dir = cache_dir(tmp_path, "ast")
    assert any(old_dir.glob("*.json"))

    # Patch in the new version and an empty _cleaned_ast_dirs set before
    # asking for the AST directory again.
    monkeypatch.setattr(cache_mod, "_EXTRACTOR_VERSION", "0.8.1", raising=False)
    monkeypatch.setattr(cache_mod, "_cleaned_ast_dirs", set(), raising=False)
    cache_dir(tmp_path, "ast")

    assert not old_dir.exists(), (
        "stale AST version directory must be removed on upgrade"
    )
def test_legacy_unversioned_ast_entries_not_served(tmp_path):
    """Pre-versioning AST entries must never be served.

    Entries in the flat ``cache/`` dir or the unversioned ``cache/ast/`` dir
    are by definition from an older extractor -- that staleness is exactly
    what version namespacing fixes.
    """
    import json
    from graphify.cache import file_hash, _GRAPHIFY_OUT

    source = tmp_path / "mod.py"
    source.write_text("def f(): pass\n")
    h = file_hash(source, tmp_path)
    payload = json.dumps({"nodes": [{"id": "stale"}], "edges": []})

    # Unversioned cache/ast/{hash}.json (pre-versioning layout)
    unversioned = tmp_path / _GRAPHIFY_OUT / "cache" / "ast"
    unversioned.mkdir(parents=True)
    (unversioned / f"{h}.json").write_text(payload)

    # Legacy flat cache/{hash}.json (pre-0.5.3 layout)
    legacy_flat = unversioned.parent / f"{h}.json"
    legacy_flat.write_text(payload)

    assert load_cached(source, root=tmp_path, kind="ast") is None
def test_semantic_cache_survives_version_bump(tmp_path, monkeypatch):
    """Semantic entries outlive a version bump; that cache is not versioned.

    Those entries are produced by the LLM from file contents, and
    re-extraction costs real money.
    """
    import graphify.cache as cache_mod

    doc = tmp_path / "doc.md"
    doc.write_text("# Title\n\nBody.\n")

    monkeypatch.setattr(cache_mod, "_EXTRACTOR_VERSION", "0.8.0", raising=False)
    save_cached(doc, {"nodes": [{"id": "n1"}], "edges": []}, root=tmp_path, kind="semantic")
    semantic_dir = cache_dir(tmp_path, "semantic")

    monkeypatch.setattr(cache_mod, "_EXTRACTOR_VERSION", "0.8.1", raising=False)
    monkeypatch.setattr(cache_mod, "_cleaned_ast_dirs", set(), raising=False)
    cache_dir(tmp_path, "ast")  # triggers stale-AST cleanup

    assert load_cached(doc, root=tmp_path, kind="semantic") is not None
    assert any(semantic_dir.glob("*.json")), (
        "semantic entries must survive both the version bump and AST cleanup"
    )
def test_save_cached_in_root_symlink_keeps_symlink_name(tmp_path):
    """``source_file`` for an in-root symlink is stored under the symlink's name.

    The resolved target must not be stored instead. Lower-impact than the
    manifest case (cache lookup is content-hashed, not key-matched), but
    keeps the on-disk shape consistent with what callers passed in.
    """
    import json
    from graphify.cache import save_cached, file_hash, cache_dir

    (tmp_path / "sub").mkdir()
    target = tmp_path / "sub" / "target.py"
    target.write_text("pass\n")
    alias = tmp_path / "alias.py"
    try:
        alias.symlink_to(target)
    except (OSError, NotImplementedError):
        # Uses the module-level pytest import; no local re-import needed.
        pytest.skip("filesystem does not support symlinks")

    abs_alias = str(alias)  # caller's view — the symlink path, unresolved
    save_cached(alias, {
        "nodes": [{"id": "n1", "source_file": abs_alias}],
        "edges": [],
    }, root=tmp_path, kind="ast")

    # Inspect the raw entry on disk rather than going through load_cached.
    h = file_hash(alias, tmp_path)
    entry = cache_dir(tmp_path, "ast") / f"{h}.json"
    on_disk = json.loads(entry.read_text(encoding="utf-8"))
    assert on_disk["nodes"][0]["source_file"] == "alias.py", (
        f"cache must store symlink name, not resolved target; got "
        f"{on_disk['nodes'][0]['source_file']!r}"
    )