-
Notifications
You must be signed in to change notification settings - Fork 501
Expand file tree
/
Copy pathevaluator.py
More file actions
346 lines (290 loc) · 12.5 KB
/
Copy pathevaluator.py
File metadata and controls
346 lines (290 loc) · 12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
#
# PySceneDetect: Python-Based Video Scene Detector
# -------------------------------------------------------------------
# [ Site: https://scenedetect.com ]
# [ Docs: https://scenedetect.com/docs/ ]
# [ Github: https://github.com/Breakthrough/PySceneDetect/ ]
#
# Copyright (C) 2026 Brandon Castellano <http://www.bcastell.com>.
# PySceneDetect is licensed under the BSD 3-Clause License; see the
# included LICENSE file, or visit one of the above pages for details.
#
"""Scoring for shot-boundary-detection benchmarks.
Implements the TRECVID-SBD evaluation convention. Each predicted boundary is one integer frame
number. Hard cuts are matched against ground-truth frames, with a configurable frame-tolerance.
Matches are scored via greedy 1-to-1 nearest-neighbor assignment. Fades and other gradual
transitions are matched by point-in-interval membership, where the prediction inside an interval
is considered a match. Other predictions in the same interval are considered false positives.
References:
- Smeaton, Over & Doherty (2010), "Video shot boundary detection: Seven years of TRECVid activity",
*Computer Vision and Image Understanding*.
https://ora.ox.ac.uk/objects/uuid:868aebdf-298a-4567-b47f-c8f9e3a6ac7a
- Hassanien et al. (2017), "Large-scale, Fast and Accurate Shot Boundary Detection through
Spatio-temporal Convolutional Neural Networks", arXiv:1705.03281.
https://arxiv.org/abs/1705.03281
"""
from __future__ import annotations
import math
from collections.abc import Iterable
from dataclasses import dataclass, field
from pathlib import Path
from statistics import mean
from typing import TypeAlias
# 1-based frame number, matching the convention used by the BBC/AutoShot text annotations and by
# PySceneDetect's :class:`FrameTimecode`. Used for cut positions and for tolerance windows.
#
# Ironically, all the work we did in v0.7 to support VFR is meaningless for most existing benchmarks
# since they are all CFR. In the future we should consider extending the API to support temporal
# units of time or PTS, and also see if other datasets might take this into account.
Frames: TypeAlias = int
@dataclass(frozen=True)
class EventInterval:
"""Inclusive ``[start, end]`` frame range for a gradual transition (dissolve/fade)."""
start: Frames
end: Frames
def contains(self, frame: Frames) -> bool:
return self.start <= frame <= self.end
@dataclass
class GroundTruth:
"""Ground truth for one video, consisting of hard cut frames and fade intervals."""
hard_cuts: list[Frames]
fades: list[EventInterval] = field(default_factory=list)
category: str | None = None
@dataclass
class Prediction:
"""One detector run on one video, ready for scoring against typed ground truth."""
predicted_cuts: list[Frames]
"""Flat list of predicted hard cut frame numbers, 1-based."""
ground_truth: GroundTruth
"""Ground truth for the video being scored."""
elapsed: float
"""How long it took to run the prediction, in seconds. Used for performance not accuracy."""
@dataclass
class EventMetrics:
"""Per-event-type scoring counts used to calculate precision, recall, and F1 score.
Each instance should be used to score *one* event type (either hard cuts *or* fade transitions)
against ground truth.
"""
# Detector fired on a real event in the ground truth.
matched: int = 0
# Detector fired but there was no real event at that frame.
false_positives: int = 0
# Real event in the ground truth that the detector failed to fire on.
missed: int = 0
@property
def precision(self) -> float:
denom = self.matched + self.false_positives
return self.matched / denom if denom else 0.0
@property
def recall(self) -> float:
denom = self.matched + self.missed
return self.matched / denom if denom else 0.0
@property
def f1(self) -> float:
p, r = self.precision, self.recall
return 2 * p * r / (p + r) if (p + r) else 0.0
def __add__(self, other: EventMetrics) -> EventMetrics:
return EventMetrics(
matched=self.matched + other.matched,
false_positives=self.false_positives + other.false_positives,
missed=self.missed + other.missed,
)
def to_dict(self) -> dict:
return {
"matched": self.matched,
"false_positives": self.false_positives,
"missed": self.missed,
"precision": round(self.precision * 100, 4),
"recall": round(self.recall * 100, 4),
"f1": round(self.f1 * 100, 4),
}
@dataclass
class VideoMetrics:
"""Per-video result at one tolerance. The video's path lives in the enclosing
:class:`BenchmarkResult.per_video` dict key, not on this object."""
elapsed: float
category: str | None
hard_cuts: EventMetrics
fades: EventMetrics
# (sum of |prediction - ground_truth|, match count) over hard-cut matches.
# Stored as raw sums so aggregation across videos is `sum / total_matched`,
# not a mean-of-means.
hard_offset: tuple[float, int]
@property
def mean_abs_offset(self) -> float:
s, n = self.hard_offset
return s / n if n else math.nan
def to_dict(self) -> dict:
return {
"elapsed": self.elapsed,
"category": self.category,
"hard_cuts": self.hard_cuts.to_dict(),
"fades": self.fades.to_dict(),
"mean_abs_offset_hard_cuts": self.mean_abs_offset,
}
@dataclass
class BenchmarkResult:
"""Aggregate result of running one detector configuration on a dataset at one tolerance.
``per_video`` is keyed by source video path so per-video lookups are explicit; aggregate
properties sum counts across all videos (same convention used by TRECVID).
"""
per_video: dict[Path, VideoMetrics]
tolerance: Frames
@property
def hard_cuts(self) -> EventMetrics:
total = EventMetrics()
for v in self.per_video.values():
total = total + v.hard_cuts
return total
@property
def fades(self) -> EventMetrics:
total = EventMetrics()
for v in self.per_video.values():
total = total + v.fades
return total
@property
def mean_abs_offset_hard_cuts(self) -> float:
num = sum(v.hard_offset[0] for v in self.per_video.values())
den = sum(v.hard_offset[1] for v in self.per_video.values())
return num / den if den else math.nan
@property
def elapsed_total(self) -> float:
return sum(v.elapsed for v in self.per_video.values())
@property
def elapsed_mean(self) -> float:
return mean(v.elapsed for v in self.per_video.values()) if self.per_video else 0.0
def by_category(self) -> dict[str, BenchmarkResult]:
buckets: dict[str, dict[Path, VideoMetrics]] = {}
for path, v in self.per_video.items():
buckets.setdefault(v.category or "unknown", {})[path] = v
return {
g: BenchmarkResult(per_video=vids, tolerance=self.tolerance)
for g, vids in buckets.items()
}
def to_dict(self, root: Path | None = None) -> dict:
def _fmt_path(p: Path) -> str:
if root is not None:
try:
return p.relative_to(root).as_posix()
except ValueError:
pass
return p.as_posix()
return {
"tolerance": self.tolerance,
"aggregate": {
"hard_cuts": self.hard_cuts.to_dict(),
"mean_abs_offset_hard_cuts": self.mean_abs_offset_hard_cuts,
"fades": self.fades.to_dict(),
"elapsed_total": self.elapsed_total,
"elapsed_mean": self.elapsed_mean,
"video_count": len(self.per_video),
},
"per_video": {_fmt_path(path): v.to_dict() for path, v in self.per_video.items()},
}
def _score_hard_cuts(
predicted_cuts: Iterable[Frames],
ground_truth_cuts: Iterable[Frames],
tolerance: Frames,
) -> tuple[EventMetrics, list[Frames]]:
"""Greedy 1-to-1 nearest-neighbor matching within ``tolerance`` frames.
Builds the set of all (prediction, ground-truth) candidate pairs whose absolute frame distance
is within tolerance, sorts by distance, and walks the sorted list claiming the first unused
pair each time. Ties on distance are broken by stable iteration order, which is deterministic
but otherwise unspecified - fine since we report aggregate metrics, not per-event assignments.
Returns the event metrics and the per-match absolute offsets (for later averaging).
"""
predicted_cuts = list(predicted_cuts)
ground_truth_cuts = list(ground_truth_cuts)
candidates: list[tuple[int, int, int]] = []
for i, p in enumerate(predicted_cuts):
for j, g in enumerate(ground_truth_cuts):
d = abs(p - g)
if d <= tolerance:
candidates.append((d, i, j))
candidates.sort()
prediction_used = [False] * len(predicted_cuts)
ground_truth_used = [False] * len(ground_truth_cuts)
offsets: list[int] = []
for d, i, j in candidates:
if not prediction_used[i] and not ground_truth_used[j]:
prediction_used[i] = True
ground_truth_used[j] = True
offsets.append(d)
matched = len(offsets)
return (
EventMetrics(
matched=matched,
false_positives=len(predicted_cuts) - matched,
missed=len(ground_truth_cuts) - matched,
),
offsets,
)
def _score_fade_transitions(
predicted_cuts: Iterable[Frames],
intervals: Iterable[EventInterval],
) -> tuple[EventMetrics, set[int]]:
"""Point-in-interval matching for gradual fade transitions.
Each prediction that falls inside any ground-truth interval is consumed by that interval
(first-match wins). The first prediction to land in an interval is the match; any further
predictions in the same interval are false positives. Predictions outside every interval are
not touched here - they go back to the hard-cut scorer.
Returns the fade transition metrics and the set of *positional indices* (into
``predicted_cuts``, not frame values) that were consumed by a fade interval, so the caller
can skip them when running hard matching.
"""
predicted_cuts = list(predicted_cuts)
intervals = list(intervals)
consumed: set[int] = set()
intervals_matched: set[EventInterval] = set()
matched = 0
false_positives = 0
for k, p in enumerate(predicted_cuts):
for interval in intervals:
if interval.contains(p):
consumed.add(k)
if interval in intervals_matched:
false_positives += 1
else:
intervals_matched.add(interval)
matched += 1
break
missed = len(intervals) - matched
return (
EventMetrics(matched=matched, false_positives=false_positives, missed=missed),
consumed,
)
def score_video(
predicted_cuts: Iterable[Frames],
ground_truth: GroundTruth,
tolerance: Frames,
elapsed: float,
) -> VideoMetrics:
"""Score one video against typed ground truth at one tolerance.
Fade transition matching runs first; predictions that land inside any fade interval are
consumed there and excluded from hard-cut matching. The remaining predictions are matched
against ground-truth hard cuts at ``tolerance`` frames.
"""
predicted_cuts = list(predicted_cuts)
fade_metrics, consumed = _score_fade_transitions(predicted_cuts, ground_truth.fades)
remaining_cuts = [p for k, p in enumerate(predicted_cuts) if k not in consumed]
hard_metrics, offsets = _score_hard_cuts(remaining_cuts, ground_truth.hard_cuts, tolerance)
return VideoMetrics(
elapsed=elapsed,
category=ground_truth.category,
hard_cuts=hard_metrics,
fades=fade_metrics,
hard_offset=(float(sum(offsets)), len(offsets)),
)
def evaluate(predictions: dict[Path, Prediction], tolerance: Frames) -> BenchmarkResult:
"""Score predictions at a single tolerance and return aggregate + per-video results."""
assert predictions, "predictions must not be empty"
videos = {
path: score_video(
predicted_cuts=p.predicted_cuts,
ground_truth=p.ground_truth,
tolerance=tolerance,
elapsed=p.elapsed,
)
for path, p in predictions.items()
}
return BenchmarkResult(per_video=videos, tolerance=tolerance)