-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathautofinish.sh
More file actions
executable file
·121 lines (107 loc) · 4.43 KB
/
autofinish.sh
File metadata and controls
executable file
·121 lines (107 loc) · 4.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env bash
# Autonomous finisher: waits for held-out pairwise to complete, runs compare,
# injects held-out section into paper, recompiles PDF, commits, pushes.
# Retries pairwise if fewer than 6 pairs, retries git push on transient failure.
#
# Every relative path used below (paper/, scripts/, .env) is resolved against
# the parent of the directory that contains this script.

# Move to that parent directory. Abort when this fails: the later steps,
# including git add/commit/push, would otherwise operate on whatever
# directory the caller happened to be in.
cd "$(dirname -- "$0")/.." || {
  echo "autofinish: cannot cd to $(dirname -- "$0")/.." >&2
  exit 1
}

# Export everything assigned in .env to child processes (`set -a` marks each
# assignment for export). The explicit ./ prevents `source` from searching
# $PATH for a file named .env. A missing file is reported and skipped.
if [[ -f ./.env ]]; then
  set -a
  source ./.env
  set +a
else
  echo "autofinish: .env not found in $PWD, continuing without it" >&2
fi
# Print the current wall-clock time as HH:MM:SS.
STAMP() {
  date '+%H:%M:%S'
}

# Write all arguments as a single line on stdout, prefixed with "[HH:MM:SS] ".
log() {
  local now
  now=$(STAMP)
  printf '[%s] %s\n' "$now" "$*"
}
log "autofinish started"

# 1. Wait for the current pairwise python to exit
# Poll once a minute until no process whose full command line matches
# run_heldout_pairwise is left (pgrep -f matches against the command line).
until ! pgrep -f run_heldout_pairwise &>/dev/null; do
  sleep 60
done
log "pairwise process gone"
# 2. Check pair count; if <6, retry pairwise (it skips already-done pairs)

#######################################
# Count the complete pairs recorded in a held-out results file.
# A pair counts as complete when its entry under metrics._pairwise has a
# "total" of at least the given minimum.
# Arguments: $1 - results JSON path
#                 (default: paper/results/eval_results_heldout.json)
#            $2 - minimum "total" for a pair to count (default: 50)
# Outputs:   the count on stdout; 0 when the file is missing or unparsable,
#            or when python3 cannot be run, so callers always get an integer
#######################################
count_complete_pairs() {
  local results_json=${1:-paper/results/eval_results_heldout.json}
  local min_total=${2:-50}
  local n
  # Path and threshold are passed through argv instead of being interpolated
  # into the Python source, so neither needs quoting inside the program text.
  n=$(python3 -c '
import json, sys
try:
    with open(sys.argv[1]) as fh:
        data = json.load(fh)
    pairwise = data["metrics"].get("_pairwise", {})
    threshold = int(sys.argv[2])
    print(sum(1 for v in pairwise.values() if v.get("total", 0) >= threshold))
except Exception:
    print(0)
' "$results_json" "$min_total" 2>/dev/null)
  # Anything that is not a plain non-negative integer (for example empty output
  # because python3 is unavailable) is treated as zero complete pairs.
  [[ "$n" =~ ^[0-9]+$ ]] || n=0
  printf '%s\n' "$n"
}

for attempt in 1 2 3; do
  count=$(count_complete_pairs)
  log "attempt $attempt: $count/6 complete pairs"
  if (( count >= 6 )); then
    break
  fi
  log "pairwise incomplete, re-running (attempt $attempt)"
  retry_log="/tmp/heldout_pw_retry_${attempt}.log"
  # Runs in the foreground; the loop continues only once the retry has exited.
  if nohup python3 -u scripts/run_heldout_pairwise.py > "$retry_log" 2>&1; then
    log "pairwise retry $attempt finished"
  else
    log "pairwise retry $attempt exited with status $? (see $retry_log)"
  fi
done

# Final sanity: fewer than 4 of 6 complete pairs is flagged in the log, but
# the run continues regardless. The count is taken again here because the
# last retry above is not followed by a re-check inside the loop.
final_count=$(count_complete_pairs)
log "final pair count: $final_count/6"
if (( final_count < 4 )); then
  log "WARNING: only $final_count pairs available, proceeding anyway but flagging"
fi
# 3. Comparison
# A failure of the comparison script is logged but does not stop the run.
# Its stdout and stderr are both captured in /tmp/autofinish_compare.txt.
if ! python3 scripts/compare_corpora.py &> /tmp/autofinish_compare.txt; then
  log "compare_corpora failed (non-fatal)"
fi
log "compare_corpora done"
# 4. Inject held-out section into paper (idempotent: removes prior injection)
# Unlike the comparison step, a failure here ends the run: the captured
# output of the injector is printed and the script exits with status 1.
inject_out=/tmp/autofinish_inject.txt
if ! python3 scripts/inject_heldout_section.py &> "$inject_out"; then
  log "ABORT: inject_heldout_section failed"
  cat "$inject_out"
  exit 1
fi
log "paper updated"
# 5. Recompile PDF
# The build runs in a subshell so the working directory of the main script
# never changes (and a failed `cd paper` fails the build instead of running
# tectonic in the wrong place). Success is judged by the exit status first:
# an aerograph.pdf left over from an earlier build would satisfy a bare
# existence check even when this compile failed.
tectonic_log=/tmp/autofinish_tectonic.txt
if ! (cd paper && tectonic aerograph.tex) > "$tectonic_log" 2>&1; then
  log "ABORT: tectonic failed"
  tail -n 20 "$tectonic_log"
  exit 1
fi
if [[ ! -f paper/aerograph.pdf ]]; then
  log "ABORT: PDF not produced"
  tail -n 20 "$tectonic_log"
  exit 1
fi
pdf_bytes=$(wc -c < paper/aerograph.pdf)
# Some wc implementations pad the number with blanks; strip them for the log.
log "PDF recompiled (${pdf_bytes//[[:space:]]/} bytes)"
# 6. Commit
# Each path is staged with its own `git add`. A single `git add` given several
# pathspecs stages nothing at all when any one of them matches no file, which
# previously surfaced only as a misleading "nothing staged" message.
commit_paths=(
  paper/aerograph.tex
  paper/aerograph.pdf
  paper/results/eval_results_heldout.json
  scripts/inject_heldout_section.py
  scripts/autofinish.sh
  scripts/run_heldout_pairwise.py
  scripts/compare_corpora.py
  scripts/run_heldout_eval.py
  scripts/build_heldout_corpus.py
  scripts/heldout_finish.sh
)
for commit_path in "${commit_paths[@]}"; do
  # Best effort per file: a path that cannot be staged is reported and skipped.
  git add -- "$commit_path" 2>/dev/null \
    || log "could not stage $commit_path, skipping"
done

# `git diff --cached --quiet` exits 0 when the index matches HEAD.
if git diff --cached --quiet; then
  log "nothing staged to commit (maybe already pushed?)"
# Two -m options: git joins them with a blank line, which keeps the subject
# line separate from the body.
elif git commit -m "paper(#3): held-out cross-corpus replication study" -m "Adds Section 6.5 Held-Out Replication with paired-Wilcoxon significance
on both primary and held-out corpora and pairwise blind preference on
both. All numbers produced by scripts/inject_heldout_section.py from
paper/results/eval_results_heldout.json.
Substantive findings:
- Primary-corpus Hybrid > Vector P@10 significance (p=0.047) does NOT
replicate on held-out (p~=0.14). GraphRAG > Vector (p=0.018) also
does not fully replicate (p~=0.08).
- Graph-only attains the highest held-out P@10 (0.138) and is the only
system significantly beating vector on held-out (p=0.038). The
pattern is inverted vs primary, where Graph-only was worst.
- Ollama faithfulness is stable or higher on held-out for every system.
- Pairwise-preference win rates flip: graph-augmented systems no longer
reliably beat vector baseline on held-out.
Interpretation: the entity-overlap relevance oracle is corpus-specific
(extraction and graph move together), so retrieval-precision metrics
against such oracles are not reliable cross-corpus comparators in
entity-rich domains. Paper is now a cautionary/methodological
contribution on oracle circularity rather than a positive-result paper.
Abstract, Threats to Validity, Discussion all updated."; then
  log "committed"
else
  # Do not report success or go on to push when the commit itself failed
  # (for example a rejecting hook or a missing committer identity).
  log "ABORT: git commit failed"
  exit 1
fi
# 7. Push with retry
# Up to max_push_attempts tries, 30 seconds apart. The stderr of each attempt
# is kept in its own /tmp/autofinish_push_<n>.log and its last three lines are
# echoed on failure. The final failed attempt neither announces a retry nor
# sleeps, since nothing follows it.
# NOTE(review): RUNTHISFILE is passed to `git push` as the ref to push —
# confirm it is the intended branch name.
max_push_attempts=5
for (( i = 1; i <= max_push_attempts; i++ )); do
  push_log="/tmp/autofinish_push_${i}.log"
  if git push origin RUNTHISFILE 2> "$push_log"; then
    log "AUTOFINISH COMPLETE — pushed to origin on attempt $i"
    exit 0
  fi
  if (( i == max_push_attempts )); then
    log "push attempt $i failed"
    tail -n 3 "$push_log"
    break
  fi
  log "push attempt $i failed, retrying in 30s"
  tail -n 3 "$push_log"
  sleep 30
done
log "PUSH FAILED after $max_push_attempts attempts; local commit intact"
exit 1