-
-
Notifications
You must be signed in to change notification settings - Fork 43
Expand file tree
/
Copy pathtest_simulate_properties.py
More file actions
298 lines (236 loc) · 10.5 KB
/
test_simulate_properties.py
File metadata and controls
298 lines (236 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
"""Hypothesis property tests for ``bernstein simulate`` (issue #1374).
Invariants under test:
* Determinism: same plan + same seed -> identical report (compared via
  ``to_dict()``).
* Monotonicity in budget cap: a stricter cap never widens the no-breach
  region; a relaxed cap never shrinks it.
* No negative cost / latency predictions for any task.
* Cost p90 >= cost p50 for every task.
* Latency p90 >= latency p50 for every task.
* Abandon probability always in [0, 1].
* Blast-radius score always in [0, 1].
* Criterion-profile bias sums to ~1.0 for non-empty plans.
* Aggregate totals are the sum of per-task contributions.
* Aggregate cost and wall-clock p90 >= p50; ``task_count`` equals the number
  of reported tasks; ``history_samples`` is never negative.
* All decision edges reference tasks that exist (no dangling ids).
* Trace history makes the abandon prediction match the observed rate
  (within 0.01).
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
import pytest
import yaml
from hypothesis import HealthCheck, given, settings
from hypothesis import strategies as st
from bernstein.core.simulate import SimulationOptions, simulate
# ---------------------------------------------------------------------------
# Strategies
# ---------------------------------------------------------------------------
# Role names that a generated plan step may carry.
_KNOWN_ROLES: list[str] = ["backend", "frontend", "qa", "security", "docs", "devops", "architect"]


def _label_text(max_size: int) -> st.SearchStrategy[str]:
    """Build a strategy for non-empty labels of at most *max_size* characters.

    The alphabet is restricted to the Unicode categories Ll, Lu and Nd.
    """
    letters_and_digits = st.characters(whitelist_categories=("Ll", "Lu", "Nd"))
    return st.text(alphabet=letters_and_digits, min_size=1, max_size=max_size)


# One plan step: a title of at most 15 characters plus one of the known roles.
_step_strategy = st.fixed_dictionaries(
    {
        "title": _label_text(15),
        "role": st.sampled_from(_KNOWN_ROLES),
    }
)

# One stage: a name of at most 10 characters and 1-4 steps with distinct titles.
_stage_strategy = st.fixed_dictionaries(
    {
        "name": _label_text(10),
        "steps": st.lists(
            _step_strategy,
            min_size=1,
            max_size=4,
            unique_by=lambda step: step["title"],
        ),
    }
)

# A whole plan: a free-form name (1-20 characters) and 1-3 stages with distinct names.
_plan_strategy = st.fixed_dictionaries(
    {
        "name": st.text(min_size=1, max_size=20),
        "stages": st.lists(
            _stage_strategy,
            min_size=1,
            max_size=3,
            unique_by=lambda stage: stage["name"],
        ),
    }
)
# Hypothesis settings shared by every property test in this module:
# 30 examples per test, no per-example deadline, and the ``too_slow`` and
# ``function_scoped_fixture`` health checks suppressed.
_SETTINGS = settings(
    suppress_health_check=[HealthCheck.too_slow, HealthCheck.function_scoped_fixture],
    deadline=None,
    max_examples=30,
)
def _write_plan(tmp_path: Path, plan: dict[str, Any]) -> Path:
    """Dump *plan* as YAML to ``plan.yaml`` inside *tmp_path*.

    Args:
        tmp_path: Existing directory that receives the file.
        plan: Plan mapping to serialise with ``yaml.safe_dump``.

    Returns:
        Path of the UTF-8 encoded ``plan.yaml`` that was written.
    """
    serialized = yaml.safe_dump(plan)
    plan_file = tmp_path / "plan.yaml"
    plan_file.write_text(serialized, encoding="utf-8")
    return plan_file
# ---------------------------------------------------------------------------
# Determinism
# ---------------------------------------------------------------------------
@given(plan=_plan_strategy, seed=st.integers(min_value=0, max_value=2**16 - 1))
@_SETTINGS
def test_determinism_same_seed_same_report(
    tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any], seed: int
) -> None:
    """Simulating one plan twice with the same seed yields equal ``to_dict()`` output."""
    plan_file = _write_plan(tmp_path_factory.mktemp("sim"), plan)

    # Run both simulations first, then serialise, each with its own options object.
    runs = [simulate(plan_file, SimulationOptions(seed=seed)) for _ in range(2)]
    first, second = (run.to_dict() for run in runs)

    assert first == second
# ---------------------------------------------------------------------------
# Non-negativity
# ---------------------------------------------------------------------------
@given(plan=_plan_strategy)
@_SETTINGS
def test_costs_are_non_negative(tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any]) -> None:
    """No task may report a negative cost or latency at either percentile."""
    plan_file = _write_plan(tmp_path_factory.mktemp("sim"), plan)
    result = simulate(plan_file)

    predicted_fields = ("cost_p50", "cost_p90", "latency_p50", "latency_p90")
    for entry in result.tasks:
        for field_name in predicted_fields:
            assert getattr(entry, field_name) >= 0.0
@given(plan=_plan_strategy)
@_SETTINGS
def test_p90_at_least_p50(tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any]) -> None:
    """For every task the p90 cost and latency are at least the p50 values."""
    sim_dir = tmp_path_factory.mktemp("sim")
    result = simulate(_write_plan(sim_dir, plan))

    for entry in result.tasks:
        for metric in ("cost", "latency"):
            upper = getattr(entry, f"{metric}_p90")
            median = getattr(entry, f"{metric}_p50")
            assert upper >= median
@given(plan=_plan_strategy)
@_SETTINGS
def test_abandon_probability_in_unit_range(tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any]) -> None:
    """Each task's abandon probability lies in the closed interval [0, 1]."""
    plan_file = _write_plan(tmp_path_factory.mktemp("sim"), plan)
    result = simulate(plan_file)

    for entry in result.tasks:
        probability = entry.abandon_probability
        assert 0.0 <= probability <= 1.0
@given(plan=_plan_strategy)
@_SETTINGS
def test_blast_radius_in_unit_range(tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any]) -> None:
    """Each task's blast-radius score lies in the closed interval [0, 1]."""
    plan_file = _write_plan(tmp_path_factory.mktemp("sim"), plan)
    result = simulate(plan_file)

    for entry in result.tasks:
        score = entry.blast_radius_score
        assert 0.0 <= score <= 1.0
# ---------------------------------------------------------------------------
# Aggregate invariants
# ---------------------------------------------------------------------------
@given(plan=_plan_strategy)
@_SETTINGS
def test_aggregate_cost_matches_sum(tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any]) -> None:
    """Aggregate p50/p90 cost totals equal the sums of the per-task costs."""
    plan_file = _write_plan(tmp_path_factory.mktemp("sim"), plan)
    result = simulate(plan_file)

    summed_p50 = sum(entry.cost_p50 for entry in result.tasks)
    summed_p90 = sum(entry.cost_p90 for entry in result.tasks)

    # approx() absorbs float rounding differences between the two summations.
    assert result.aggregate.total_cost_p50 == pytest.approx(summed_p50)
    assert result.aggregate.total_cost_p90 == pytest.approx(summed_p90)
@given(plan=_plan_strategy)
@_SETTINGS
def test_aggregate_max_blast_radius_correct(tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any]) -> None:
    """The aggregate max blast radius is the largest per-task score (0.0 with no tasks)."""
    plan_file = _write_plan(tmp_path_factory.mktemp("sim"), plan)
    result = simulate(plan_file)

    scores = [entry.blast_radius_score for entry in result.tasks]
    highest = max(scores) if scores else 0.0

    assert result.aggregate.max_blast_radius == pytest.approx(highest)
@given(plan=_plan_strategy)
@_SETTINGS
def test_criterion_bias_sums_to_one(tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any]) -> None:
    """Criterion bias components sum to ~1.0 when tasks exist and ~0.0 otherwise."""
    plan_file = _write_plan(tmp_path_factory.mktemp("sim"), plan)
    result = simulate(plan_file)

    weights = result.criterion_bias
    combined = weights.speed + weights.cost + weights.quality + weights.safety

    expected_total = 1.0 if result.task_count > 0 else 0.0
    assert combined == pytest.approx(expected_total)
# ---------------------------------------------------------------------------
# Decision edges
# ---------------------------------------------------------------------------
@given(plan=_plan_strategy)
@_SETTINGS
def test_decision_edges_reference_real_tasks(tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any]) -> None:
    """Every decision edge starts at ``START`` or a reported task and ends at a reported task."""
    plan_file = _write_plan(tmp_path_factory.mktemp("sim"), plan)
    result = simulate(plan_file)

    known_ids = {entry.task_id for entry in result.tasks}
    for link in result.decision_edges:
        source = link.from_task
        # "START" is accepted as an origin even though it is not a task id.
        assert source == "START" or source in known_ids
        assert link.to_task in known_ids
# ---------------------------------------------------------------------------
# Budget cap monotonicity
# ---------------------------------------------------------------------------
@given(plan=_plan_strategy, base=st.floats(min_value=0.0, max_value=100.0))
@_SETTINGS
def test_budget_cap_monotonicity(tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any], base: float) -> None:
    """Raising the budget cap by 1000 never adds an aggregate breach or per-task violations."""
    plan_file = _write_plan(tmp_path_factory.mktemp("sim"), plan)
    tight = simulate(plan_file, SimulationOptions(budget_cap=base))
    loose = simulate(plan_file, SimulationOptions(budget_cap=base + 1000.0))

    # Implication "no breach under the tight cap => no breach under the loose
    # cap", written as a single boolean expression.
    assert tight.aggregate.budget_breach or not loose.aggregate.budget_breach

    # The tighter cap may flag more per-task violations, never fewer.
    tight_count = len([entry for entry in tight.tasks if entry.budget_violation])
    loose_count = len([entry for entry in loose.tasks if entry.budget_violation])
    assert tight_count >= loose_count
# ---------------------------------------------------------------------------
# Report-level sanity checks and trace history overriding the cold prior
# ---------------------------------------------------------------------------
@given(plan=_plan_strategy)
@_SETTINGS
def test_history_samples_is_non_negative(tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any]) -> None:
    """The report's ``history_samples`` counter is never negative."""
    sim_dir = tmp_path_factory.mktemp("sim")
    result = simulate(_write_plan(sim_dir, plan))

    assert result.history_samples >= 0
@given(plan=_plan_strategy)
@_SETTINGS
def test_task_count_matches_tasks_length(tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any]) -> None:
    """``task_count`` equals the number of entries in ``tasks``."""
    sim_dir = tmp_path_factory.mktemp("sim")
    result = simulate(_write_plan(sim_dir, plan))

    reported = result.task_count
    assert reported == len(result.tasks)
@given(plan=_plan_strategy)
@_SETTINGS
def test_wall_clock_bands_ordered(tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any]) -> None:
    """The aggregate wall-clock p90 is at least the wall-clock p50."""
    sim_dir = tmp_path_factory.mktemp("sim")
    result = simulate(_write_plan(sim_dir, plan))

    totals = result.aggregate
    assert totals.wall_clock_p90 >= totals.wall_clock_p50
@given(plan=_plan_strategy)
@_SETTINGS
def test_aggregate_p90_at_least_p50(tmp_path_factory: pytest.TempPathFactory, plan: dict[str, Any]) -> None:
    """The aggregate p90 total cost is at least the p50 total cost."""
    sim_dir = tmp_path_factory.mktemp("sim")
    result = simulate(_write_plan(sim_dir, plan))

    totals = result.aggregate
    assert totals.total_cost_p90 >= totals.total_cost_p50
@given(observed_rate=st.floats(min_value=0.0, max_value=1.0))
@_SETTINGS
def test_traces_drive_abandon_prediction(
    tmp_path_factory: pytest.TempPathFactory,
    observed_rate: float,
) -> None:
    """With trace history present, the predicted abandon rate follows the observed one.

    A single-task plan is simulated against a 100-record JSONL trace file in
    which the first ``round(observed_rate * 100)`` records are ``abandoned``
    and the rest ``completed``. The task's ``abandon_probability`` must equal
    that rounded rate within an absolute tolerance of 0.01.
    """
    import json  # Local import: only this test serialises trace records.

    tmp_path = tmp_path_factory.mktemp("sim")
    plan = {
        "name": "Single",
        "stages": [
            {
                "name": "Only",
                "steps": [{"title": "Task", "role": "backend"}],
            }
        ],
    }
    plan_path = _write_plan(tmp_path, plan)
    traces_dir = tmp_path / "traces"
    traces_dir.mkdir()

    # Quantise the rate to whole records; the expectation below uses the same
    # quantised value so rounding does not eat into the tolerance.
    rate_int = round(observed_rate * 100)

    # json.dumps replaces hand-concatenated JSON text; with default separators
    # it emits the same bytes as the previous string-built records.
    records = [
        json.dumps(
            {
                "role": "backend",
                "adapter": "mock",
                "status": "abandoned" if index < rate_int else "completed",
                "latency_seconds": 10.0,
            }
        )
        for index in range(100)
    ]
    (traces_dir / "t.jsonl").write_text("\n".join(records) + "\n", encoding="utf-8")

    report = simulate(plan_path, SimulationOptions(traces_dir=str(traces_dir)))
    assert report.tasks[0].abandon_probability == pytest.approx(rate_int / 100.0, abs=0.01)