-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathqwen-story.sh
More file actions
executable file
·336 lines (318 loc) · 12.3 KB
/
qwen-story.sh
File metadata and controls
executable file
·336 lines (318 loc) · 12.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
#!/usr/bin/env bash
# qwen-story.sh - 8-beat narrative exercising every core apr command group
# against the Qwen series (0.5B safetensors → 30B-MoE GGUF). Each beat is a
# falsifier in contracts/qwen-story-v1.yaml. Used by:
#
# - README.md ## A Qwen story (top-of-fold quickstart)
# - .claude/skills/dogfood Gate 18 (regression detection)
# - .github/workflows/qwen-story-daily.yml (nightly bug-hunt cron)
#
# Exit codes:
# 0 all runnable beats PASS
# 2 one or more beats FAIL
# SKIP (missing model) is informational and has no exit code of its own:
# the script still exits 0 if SKIPs are the only non-PASS results
#
# Each beat uses OUT=$(cmd); EC=$? to avoid the pipe-then-$? methodology
# defect documented in memory/feedback_test_methodology_can_fake_bugs.md.
# Shell options: unset variables are errors and a pipeline fails if any stage
# fails. `-e` is not enabled; the beats below check exit codes explicitly.
set -uo pipefail
MODELS_DIR="${MODELS_DIR:-$HOME/models}"
PMAT_HUNT="${PMAT_HUNT:-1}" # 1 = run pmat full audit per beat
# Scratch directory for exported models and the server log. A caller-supplied
# TMPDIR_STORY is honoured (and created if needed); otherwise mktemp makes a
# private directory with an unpredictable name instead of a guessable
# /tmp/qwen-story-<pid> path. Failing to obtain the directory is a setup
# error: report it on stderr and exit 1 before any beat runs.
if [ -n "${TMPDIR_STORY:-}" ]; then
  if ! mkdir -p -- "$TMPDIR_STORY"; then
    printf 'qwen-story: cannot create scratch dir %s\n' "$TMPDIR_STORY" >&2
    exit 1
  fi
else
  if ! TMPDIR_STORY=$(mktemp -d "${TMPDIR:-/tmp}/qwen-story.XXXXXX"); then
    printf 'qwen-story: mktemp -d failed\n' >&2
    exit 1
  fi
fi
# On any exit: remove the scratch directory (never "/" or an empty path) and
# kill remaining child processes of this shell, e.g. a background server.
trap '[ -n "$TMPDIR_STORY" ] && [ "$TMPDIR_STORY" != "/" ] && rm -rf -- "$TMPDIR_STORY" 2>/dev/null; pkill -P $$ 2>/dev/null || true' EXIT
# -- Model registry ------------------------------------------------------------
# Paths (under $MODELS_DIR) of the model files the beats look for.
M_05B_ST="${MODELS_DIR}/qwen2.5-coder-0.5b-instruct-safetensors/model.safetensors"
M_15B_APR="${MODELS_DIR}/qwen2.5-coder-1.5b-instruct-q4k.apr"
M_7B_GGUF="${MODELS_DIR}/qwen2.5-coder-7b-instruct-q4_k_m.gguf"
M_30B_MOE="${MODELS_DIR}/Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf"
# Result tallies updated by emit_pass / emit_fail / emit_skip, plus the labels
# of every failed check for the final summary.
PASS=0 FAIL=0 SKIP=0
declare -a FAILED_BEATS=()
# Reporting helpers. Each prints one line to stdout; the emit_* functions also
# bump the matching global counter.

# note TEXT... - print the arguments, joined by spaces, behind one space.
note() {
  printf ' %s\n' "$*"
}

# emit_pass LABEL - count a passing check and print it.
emit_pass() {
  local label="$1"
  PASS=$((PASS + 1))
  printf '✓ PASS %s\n' "$label"
}

# emit_fail LABEL REASON - count a failing check, remember its label in
# FAILED_BEATS for the final summary, and print both.
emit_fail() {
  local label="$1" reason="$2"
  FAIL=$((FAIL + 1))
  FAILED_BEATS+=("$label")
  printf '✗ FAIL %s - %s\n' "$label" "$reason"
}

# emit_skip LABEL REASON - count a skipped check and print both.
emit_skip() {
  local label="$1" reason="$2"
  SKIP=$((SKIP + 1))
  printf '○ SKIP %s - %s\n' "$label" "$reason"
}
# Run a command under `timeout` and capture its result.
# Args: timeout_seconds command...
# Sets globals:
#   RC_OUT  - combined stdout+stderr of the command
#   RC_EC   - exit status of the `timeout` invocation
#   RC_TAIL - last line of RC_OUT
run_cmd() {
  local limit="$1"
  shift
  # Assignment and $? are kept as two adjacent statements so RC_EC is the
  # status of the command substitution itself, not of a later pipeline.
  RC_OUT=$(timeout "$limit" "$@" 2>&1)
  RC_EC=$?
  # printf rather than echo: echo treats output that is exactly "-n" or "-e"
  # as an option and would leave RC_TAIL empty or altered.
  RC_TAIL=$(printf '%s\n' "$RC_OUT" | tail -n 1)
}
# Print a compact pmat "bug-hunt manifest" for the sources behind a beat.
# Args: beat_query path...
#   beat_query - shown in the header and passed as the query text to the
#                churn and fault `pmat query` calls
#   path...    - paths queried one at a time via --path
# Returns 0 without output unless PMAT_HUNT is "1" and `pmat` is on PATH.
# Output: a header line, then per path up to 3 "gap", 3 "churn" and 3 "fault"
# lines (sections with no results are omitted), then one blank line.
# Errors from pmat and jq are discarded; a failed query just prints nothing.
pmat_hunt() {
  local beat="$1"
  shift
  if [ "$PMAT_HUNT" != "1" ] || ! command -v pmat >/dev/null 2>&1; then
    return 0
  fi
  printf ' -- pmat bug-hunt manifest (%s) --\n' "$beat"
  local q gaps churn faults
  for q in "$@"; do
    gaps=$(pmat query --coverage-gaps --path "$q" --rank-by impact --limit 3 \
      --format json 2>/dev/null \
      | jq -r '.[] | " gap \(.function) (impact=\(.impact_score // "?"))"' 2>/dev/null \
      | head -3)
    churn=$(pmat query "$beat" --path "$q" --churn --max-complexity 30 --limit 3 \
      --format json 2>/dev/null \
      | jq -r '.[] | " churn \(.function) (commits=\(.churn.commit_count // "?"))"' 2>/dev/null \
      | head -3)
    # String literals inside a jq \( ... ) interpolation use plain double
    # quotes. The earlier join(\",\") reached jq with the backslashes intact,
    # which is a jq syntax error; with stderr discarded the fault section was
    # silently always empty.
    faults=$(pmat query "$beat" --path "$q" --faults --exclude-tests --limit 3 \
      --format json 2>/dev/null \
      | jq -r '.[] | " fault \(.function) (\(.faults | join(",")))"' 2>/dev/null \
      | head -3)
    if [ -n "$gaps" ]; then printf '%s\n' "$gaps"; fi
    if [ -n "$churn" ]; then printf '%s\n' "$churn"; fi
    if [ -n "$faults" ]; then printf '%s\n' "$faults"; fi
  done
  printf '\n'
}
# -- Beat 1: Discover (0.5B SafeTensors) ---------------------------------------
# Skips when the 0.5B SafeTensors file is absent; otherwise runs `apr list`
# (30s limit) and passes on exit 0. The pmat manifest is printed only after a
# passing list.
beat1_discover() {
  printf -- '-- Beat 1: Discover (Registry) --\n'
  [ -f "$M_05B_ST" ] || {
    # In CI this is the model we'd PULL; locally we use cache.
    emit_skip "B1 pull" "0.5B SafeTensors not in cache at $M_05B_ST"
    return
  }
  run_cmd 30 apr list
  if [ "$RC_EC" -ne 0 ]; then
    emit_fail "B1 list" "exit=$RC_EC"
    return
  fi
  emit_pass "B1 list"
  pmat_hunt "registry list" \
    crates/apr-cli/src/commands/list.rs \
    crates/apr-cli/src/commands/pull.rs
}
# -- Beat 2: Trust (1.5B APR) --------------------------------------------------
# Runs three checks against $M_15B_APR:
#   apr qa                 - passes when the output contains "ALL GATES PASSED"
#   apr validate --quality - passes on exit 0
#   apr lint               - passes on exit 0 or 5
# A failing `apr qa` ends the beat early; validate and lint failures do not.
# The whole beat is skipped when the model file is missing.
beat2_trust() {
  printf -- '-- Beat 2: Trust (QA gates) --\n'
  if [ ! -f "$M_15B_APR" ]; then
    emit_skip "B2 qa" "1.5B APR not available at $M_15B_APR"
    return
  fi
  # Use 1.5B APR (apr qa Golden Output gate works on this; 7B has #1864).
  run_cmd 180 apr qa "$M_15B_APR"
  # Here-string instead of `echo ... | grep -q`: with `set -o pipefail`,
  # grep -q exiting on its first match can kill the writer with SIGPIPE when
  # the output is larger than the pipe buffer, and the pipeline then reports
  # failure even though the text matched.
  if grep -q "ALL GATES PASSED" <<<"$RC_OUT"; then
    emit_pass "B2 apr qa"
  else
    emit_fail "B2 apr qa" "no 'ALL GATES PASSED' line"
    return
  fi
  run_cmd 60 apr validate "$M_15B_APR" --quality
  if [ "$RC_EC" -eq 0 ]; then
    emit_pass "B2 apr validate --quality"
  else
    emit_fail "B2 apr validate --quality" "exit=$RC_EC (after #1866 fix this should be 0)"
  fi
  run_cmd 30 apr lint "$M_15B_APR"
  if [ "$RC_EC" -eq 0 ] || [ "$RC_EC" -eq 5 ]; then
    emit_pass "B2 apr lint (exit=$RC_EC)"
  else
    emit_fail "B2 apr lint" "exit=$RC_EC"
  fi
  pmat_hunt "qa validate lint" \
    crates/apr-cli/src/commands/qa.rs \
    crates/apr-cli/src/commands/validate.rs \
    crates/apr-cli/src/commands/lint.rs
}
# -- Beat 3: Explore (1.5B APR - has tokenizer next to it) ---------------------
# Runs three inspection commands against $M_15B_APR:
#   apr inspect --json - passes on exit 0 with a non-empty .architecture
#   apr tensors --json - passes on exit 0 with a positive tensor count
#                        (.tensor_count, else the length of the JSON value)
#   apr tree           - passes on exit 0
# The whole beat is skipped when the model file is missing.
beat3_explore() {
  printf -- '-- Beat 3: Explore (Inspection) --\n'
  if [ ! -f "$M_15B_APR" ]; then
    emit_skip "B3 inspect" "no APR model"
    return
  fi
  run_cmd 30 apr inspect --json "$M_15B_APR"
  local arch
  arch=$(printf '%s\n' "$RC_OUT" | jq -r '.architecture // empty' 2>/dev/null)
  if [ "$RC_EC" -eq 0 ] && [ -n "$arch" ]; then
    emit_pass "B3 apr inspect --json (arch=$arch)"
  else
    emit_fail "B3 apr inspect --json" "exit=$RC_EC arch='$arch'"
  fi
  run_cmd 30 apr tensors "$M_15B_APR" --json
  local n count=0
  n=$(printf '%s\n' "$RC_OUT" | jq '.tensor_count // (.|length) // 0' 2>/dev/null)
  # Only compare numerically when jq produced a plain non-negative integer;
  # empty or multi-line output counts as 0 instead of making `[` print an
  # "integer expression expected" error. The raw value is still reported.
  case "$n" in
    '' | *[!0-9]*) ;;
    *) count=$n ;;
  esac
  if [ "$RC_EC" -eq 0 ] && [ "$count" -gt 0 ]; then
    emit_pass "B3 apr tensors --json ($n tensors)"
  else
    emit_fail "B3 apr tensors --json" "exit=$RC_EC n=$n"
  fi
  run_cmd 30 apr tree "$M_15B_APR"
  # Real if/else: with `A && emit_pass || emit_fail`, a non-zero return from
  # emit_pass would also record a FAIL for the same check.
  if [ "$RC_EC" -eq 0 ]; then
    emit_pass "B3 apr tree"
  else
    emit_fail "B3 apr tree" "exit=$RC_EC"
  fi
  pmat_hunt "inspect tensors tree" \
    crates/apr-cli/src/commands/inspect.rs \
    crates/apr-cli/src/commands/tensors.rs \
    crates/apr-cli/src/commands/tree.rs
}
# -- Beat 4: Adapt (export + diff; convert path covered by Beat 1 pull) --------
# Exports $M_15B_APR to GGUF inside $TMPDIR_STORY and classifies the result:
#   exit 0 + non-empty file     - PASS, then `apr diff` APR vs GGUF (exit 0 passes)
#   exit 101 or a panic message - FAIL (#1865 regression)
#   exit 5 without a panic      - PASS (clean validation error)
#   anything else               - FAIL
# The whole beat is skipped when the model file is missing.
beat4_adapt() {
  printf -- '-- Beat 4: Adapt (Model ops) --\n'
  if [ ! -f "$M_15B_APR" ]; then
    emit_skip "B4 export" "no APR model"
    return
  fi
  # apr export (post-#1865 fix: panic → graceful error or success)
  local out="$TMPDIR_STORY/exported.gguf"
  rm -f -- "$out"
  run_cmd 120 apr export "$M_15B_APR" --format gguf -o "$out"
  if [ "$RC_EC" -eq 0 ] && [ -s "$out" ]; then
    emit_pass "B4 apr export → gguf"
    run_cmd 30 apr diff "$M_15B_APR" "$out"
    if [ "$RC_EC" -eq 0 ]; then
      emit_pass "B4 apr diff (APR vs round-tripped GGUF)"
    else
      emit_fail "B4 apr diff" "exit=$RC_EC"
    fi
  # The panic check runs before the exit=5 branch so that "no panic" in the
  # pass message below is actually verified. A here-string is used because
  # `echo ... | grep -q` can fail spuriously under `set -o pipefail`.
  elif [ "$RC_EC" -eq 101 ] || grep -qE 'thread.*panicked' <<<"$RC_OUT"; then
    emit_fail "B4 apr export" "PANIC (exit=$RC_EC) - #1865 regression"
  elif [ "$RC_EC" -eq 5 ]; then
    # Clean validation error is acceptable post-#1865 (e.g. missing num_heads).
    emit_pass "B4 apr export (clean exit=5, no panic)"
  elif [ "$RC_EC" -eq 0 ]; then
    # Success status but nothing usable was written.
    emit_fail "B4 apr export" "exit=0 but output file is missing or empty"
  else
    emit_fail "B4 apr export" "unexpected exit=$RC_EC"
  fi
  pmat_hunt "export convert quantize" \
    crates/aprender-core/src/format/converter/metadata.rs \
    crates/apr-cli/src/commands/convert.rs \
    crates/apr-cli/src/commands/quantize.rs
}
# -- Beat 5: Use (1.5B Q4K APR) ------------------------------------------------
# Runs `apr run` with a Rust function prefix and `apr code -p` with a fixed
# prompt:
#   apr run  - FAIL if a line repeats the <|im_start|> token; PASS on exit 0
#              with an "Output:" line; FAIL otherwise
#   apr code - PASS on exit 0; a non-zero exit is recorded as SKIP, not FAIL
# The whole beat is skipped when the model file is missing.
beat5_use() {
  printf -- '-- Beat 5: Use (Inference) --\n'
  if [ ! -f "$M_15B_APR" ]; then
    emit_skip "B5 run" "no APR model"
    return
  fi
  run_cmd 120 apr run "$M_15B_APR" "fn sum(a: i32, b: i32) -> i32 {" --max-tokens 16
  # Heuristic gibberish detector - flag if chat-template tokens repeat.
  # Both greps read from a here-string: `echo ... | grep -q` under
  # `set -o pipefail` can report failure on a match when grep exits early and
  # the writer is killed by SIGPIPE.
  if grep -qE '<\|im_start\|>.*<\|im_start\|>' <<<"$RC_OUT"; then
    emit_fail "B5 apr run" "gibberish (chat-template token repeats)"
  elif [ "$RC_EC" -eq 0 ] && grep -qE 'Output:' <<<"$RC_OUT"; then
    emit_pass "B5 apr run (Rust code completion)"
  else
    emit_fail "B5 apr run" "exit=$RC_EC, no Output line"
  fi
  # apr code -p (non-interactive coder agent)
  run_cmd 90 apr code -p "Reply with exactly: hello" --max-turns 1
  if [ "$RC_EC" -eq 0 ]; then
    emit_pass "B5 apr code -p"
  else
    emit_skip "B5 apr code -p" "non-zero exit=$RC_EC (may need --model)"
  fi
  pmat_hunt "run chat code" \
    crates/apr-cli/src/commands/run.rs \
    crates/apr-cli/src/commands/chat.rs \
    crates/apr-cli/src/commands/code.rs
}
# -- Beat 6: Serve (1.5B over HTTP) --------------------------------------------
# Starts `apr serve run` in the background on a random port in 20000-29999,
# logging to $TMPDIR_STORY/serve.log, then:
#   - polls /health up to 60 times, one second apart; FAIL if it never
#     answers or the server process exits first
#   - POSTs a small chat request to /v1/chat/completions; PASS when the reply
#     has a non-empty .choices[0].message.content
# The server is killed and reaped before returning on both paths.
# The whole beat is skipped when the model file is missing.
beat6_serve() {
  printf -- '-- Beat 6: Serve (REST API) --\n'
  if [ ! -f "$M_15B_APR" ]; then
    emit_skip "B6 serve" "no APR model"
    return
  fi
  local port=$((20000 + RANDOM % 10000))
  apr serve run "$M_15B_APR" --port "$port" > "$TMPDIR_STORY/serve.log" 2>&1 &
  local pid=$!
  # Wait up to 60s for /health to come up.
  local up=0 alive=1 attempt
  for ((attempt = 0; attempt < 60; attempt++)); do
    # Stop polling as soon as the server process is gone: nothing will ever
    # answer on our behalf, so waiting out the full minute only wastes time.
    if ! kill -0 "$pid" 2>/dev/null; then
      alive=0
      break
    fi
    if curl -s -m 2 "http://127.0.0.1:$port/health" >/dev/null 2>&1; then
      up=1
      break
    fi
    sleep 1
  done
  if [ "$up" = "0" ]; then
    kill "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
    if [ "$alive" = "0" ]; then
      emit_fail "B6 apr serve run" "server process exited before /health responded"
    else
      emit_fail "B6 apr serve run" "server did not start within 60s"
    fi
    return
  fi
  emit_pass "B6 apr serve run (port=$port)"
  # /v1/chat/completions OpenAI-compat smoke
  local resp
  resp=$(curl -s -m 60 -X POST "http://127.0.0.1:$port/v1/chat/completions" \
    -H 'Content-Type: application/json' \
    -d '{"model":"qwen","messages":[{"role":"user","content":"reply with: ok"}],"max_tokens":4}')
  local content
  content=$(printf '%s\n' "$resp" | jq -r '.choices[0].message.content // empty' 2>/dev/null)
  if [ -n "$content" ]; then
    # Substring expansion truncates by character; `head -c 20` truncates by
    # byte and can cut a multibyte UTF-8 character in half.
    emit_pass "B6 /v1/chat/completions (got ${content:0:20}...)"
  else
    emit_fail "B6 /v1/chat/completions" "no message.content in response"
  fi
  kill "$pid" 2>/dev/null
  wait "$pid" 2>/dev/null
  pmat_hunt "serve http chat-completions" \
    crates/apr-cli/src/commands/serve.rs \
    crates/aprender-serve/src/api/cuda_chat_backend.rs
}
# -- Beat 7: Operate (7B Q4K GGUF - profile/bench, NOT apr qa which has #1864) -
# Runs `apr profile` and `apr serve plan` against $M_7B_GGUF plus
# `apr gpu --json`; each passes on exit 0.
# The whole beat is skipped when the 7B GGUF file is missing.
beat7_operate() {
  printf -- '-- Beat 7: Operate (Profiling) --\n'
  if [ ! -f "$M_7B_GGUF" ]; then
    emit_skip "B7 profile" "7B GGUF not at $M_7B_GGUF"
    return
  fi
  # profile/bench/parity don't actually generate; safe even with #1864 open.
  # Each check is a real if/else: with `A && emit_pass || emit_fail`, a
  # non-zero return from emit_pass would also record a FAIL for the same check.
  run_cmd 60 apr profile "$M_7B_GGUF"
  if [ "$RC_EC" -eq 0 ]; then
    emit_pass "B7 apr profile"
  else
    emit_fail "B7 apr profile" "exit=$RC_EC"
  fi
  run_cmd 30 apr gpu --json
  if [ "$RC_EC" -eq 0 ]; then
    emit_pass "B7 apr gpu --json"
  else
    emit_fail "B7 apr gpu --json" "exit=$RC_EC"
  fi
  run_cmd 60 apr serve plan "$M_7B_GGUF"
  if [ "$RC_EC" -eq 0 ]; then
    emit_pass "B7 apr serve plan -- 7B VRAM budget"
  else
    emit_fail "B7 apr serve plan" "exit=$RC_EC"
  fi
  pmat_hunt "profile bench gpu parity" \
    crates/apr-cli/src/commands/profile.rs \
    crates/apr-cli/src/commands/bench.rs \
    crates/apr-cli/src/commands/gpu.rs \
    crates/apr-cli/src/commands/parity.rs
}
# -- Beat 8: Scale (30B-MoE) ---------------------------------------------------
# Inspects $M_30B_MOE without running it:
#   apr inspect --json - passes when .architecture is "qwen3moe"
#   apr tensors --json - passes when .tensor_count is at least 500
# The whole beat is skipped when the 30B-MoE file is missing.
beat8_scale() {
  printf -- '-- Beat 8: Scale (MoE introspection) --\n'
  if [ ! -f "$M_30B_MOE" ]; then
    emit_skip "B8 inspect MoE" "30B-MoE not at $M_30B_MOE"
    return
  fi
  run_cmd 60 apr inspect --json "$M_30B_MOE"
  local arch
  arch=$(printf '%s\n' "$RC_OUT" | jq -r '.architecture // empty' 2>/dev/null)
  if [ "$arch" = "qwen3moe" ]; then
    emit_pass "B8 apr inspect --json (arch=qwen3moe)"
  else
    emit_fail "B8 apr inspect --json" "arch='$arch' (expected qwen3moe)"
  fi
  run_cmd 60 apr tensors --json "$M_30B_MOE"
  local n count=0
  n=$(printf '%s\n' "$RC_OUT" | jq '.tensor_count // 0' 2>/dev/null)
  # Only compare numerically when jq produced a plain non-negative integer;
  # anything else counts as 0 instead of making `[` print an error.
  case "$n" in
    '' | *[!0-9]*) ;;
    *) count=$n ;;
  esac
  # -ge, not -gt: the failure message states the requirement as ">= 500", so
  # exactly 500 tensors must pass.
  if [ "$count" -ge 500 ]; then
    emit_pass "B8 apr tensors --json ($n tensors)"
  else
    emit_fail "B8 apr tensors --json" "$n tensors (expected ≥500 for 30B-MoE)"
  fi
  pmat_hunt "moe inspect qwen3" \
    crates/aprender-serve/src/infer/qwen3_moe_generate.rs \
    crates/aprender-serve/src/api/cuda_chat_backend.rs
}
# -- Main ----------------------------------------------------------------------
# Print a banner, run the eight beats in order, then print the elapsed time
# and the PASS/FAIL/SKIP tally. Exits 2 (after listing the failed checks) when
# at least one check failed, otherwise 0.
START=$(date +%s)
printf '\n=== Qwen Story v1 - apr=%s, dir=%s ===\n\n' "$(apr --version 2>&1 | head -1)" "$MODELS_DIR"
for beat_fn in \
  beat1_discover \
  beat2_trust \
  beat3_explore \
  beat4_adapt \
  beat5_use \
  beat6_serve \
  beat7_operate \
  beat8_scale; do
  "$beat_fn"
done
ELAPSED=$(($(date +%s) - START))
printf '\n=== Story complete in %ds ===\n' "$ELAPSED"
printf ' %d PASS / %d FAIL / %d SKIP\n' "$PASS" "$FAIL" "$SKIP"
# No failures: nothing more to report.
[ "$FAIL" -gt 0 ] || exit 0
printf '\nFailed beats:\n'
for b in "${FAILED_BEATS[@]}"; do
  printf ' - %s\n' "$b"
done
exit 2