-
Notifications
You must be signed in to change notification settings - Fork 0
feat(observability): fallback-alert plugin — Telegram notification on provider fallback #12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,199 @@ | ||
| """fallback-alert — Telegram notification when Hermes activates provider fallback. | ||
|
|
||
| Detects, by comparing (provider, model) seen in successive ``post_api_request`` | ||
| hook calls within the same session, that Hermes has swapped to a different | ||
| provider — which is the signature of an activated fallback in | ||
| ``run_agent.py::_try_activate_fallback`` after a primary failure (429 / 5xx / | ||
| auth-error per ``agent/error_classifier.py``). | ||
|
|
||
| The plugin records the (provider, model) of the first API call of a session | ||
| as that session's primary; any later call with a different (provider, model) | ||
| triggers a Telegram message. Throttled per session. | ||
|
|
||
| No imports from Hermes internals. No third-party deps. Activates only when | ||
| both Telegram env vars are set; otherwise the hook is a no-op. | ||
|
|
||
| Required env vars | ||
| ----------------- | ||
| FALLBACK_ALERT_TELEGRAM_BOT_TOKEN | ||
| Bot token, e.g. ``123456:ABC-DEF...`` | ||
| FALLBACK_ALERT_TELEGRAM_CHAT_ID | ||
| Numeric user/group chat id, or ``@channelusername`` | ||
|
|
||
| Optional env vars | ||
| ----------------- | ||
| FALLBACK_ALERT_THROTTLE_SECONDS | ||
| Min seconds between alerts per session. Default 300. | ||
| FALLBACK_ALERT_DEBUG | ||
| ``true`` to log no-op reasons at INFO level. | ||
| """ | ||
| from __future__ import annotations | ||
|
|
||
| import json | ||
| import logging | ||
| import os | ||
| import threading | ||
| import time | ||
| import urllib.error | ||
| import urllib.request | ||
| from typing import Dict, Optional, Tuple | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
# Module-level state: each entry is per-session.
# session_id -> (provider, model) recorded from the first API call seen for
# that session; later calls are compared against this pair.
_PRIMARY_BY_SESSION: Dict[str, Tuple[str, str]] = {}
# session_id -> ``time.time()`` timestamp of the most recent alert attempt,
# used to throttle repeated alerts.
_LAST_ALERT_BY_SESSION: Dict[str, float] = {}
# Taken by the hook handler and the test-reset helper when they touch the
# session state above.
_STATE_LOCK = threading.Lock()
|
|
||
|
|
||
def _env(name: str, default: str = "") -> str:
    """Return environment variable *name* with surrounding whitespace removed.

    When the variable is unset, *default* is used instead (and is stripped
    the same way).
    """
    raw_value = os.environ.get(name)
    if raw_value is None:
        raw_value = default
    return raw_value.strip()
|
|
||
|
|
||
def _debug_enabled() -> bool:
    """Report whether ``FALLBACK_ALERT_DEBUG`` holds a truthy flag value.

    Accepted values, compared case-insensitively: ``1``, ``true``, ``yes``
    and ``on``. Anything else (including unset) counts as disabled.
    """
    flag = _env("FALLBACK_ALERT_DEBUG").lower()
    truthy_values = ("1", "true", "yes", "on")
    return flag in truthy_values
|
|
||
|
|
||
def _throttle_seconds() -> int:
    """Return the minimum number of seconds between alerts for one session.

    Read from ``FALLBACK_ALERT_THROTTLE_SECONDS``; defaults to 300 when the
    variable is unset or not an integer, and is never lower than 1.
    """
    configured = _env("FALLBACK_ALERT_THROTTLE_SECONDS", "300")
    try:
        seconds = int(configured)
    except ValueError:
        return 300
    # Clamp so a zero or negative setting cannot disable throttling entirely.
    return seconds if seconds > 1 else 1
|
|
||
|
|
||
def _credentials() -> Optional[Tuple[str, str]]:
    """Return ``(bot_token, chat_id)`` from the environment, or ``None``.

    ``None`` is returned unless both ``FALLBACK_ALERT_TELEGRAM_BOT_TOKEN`` and
    ``FALLBACK_ALERT_TELEGRAM_CHAT_ID`` are set to non-blank values.
    """
    bot_token = _env("FALLBACK_ALERT_TELEGRAM_BOT_TOKEN")
    target_chat = _env("FALLBACK_ALERT_TELEGRAM_CHAT_ID")
    if bot_token and target_chat:
        return bot_token, target_chat
    return None
|
|
||
|
|
||
def _send_telegram(token: str, chat_id: str, text: str) -> bool:
    """POST *text* to the Telegram Bot API ``sendMessage`` endpoint.

    Args:
        token: Bot token; it is embedded in the request URL.
        chat_id: Target chat id or ``@channelusername``.
        text: Message body, sent with ``parse_mode`` set to ``Markdown``.

    Returns:
        True when the response status is below 300, False on any HTTP or
        transport failure. Never raises; failures are logged at WARNING.
    """
    try:
        url = f"https://api.telegram.org/bot{token}/sendMessage"
        body = json.dumps(
            {
                "chat_id": chat_id,
                "text": text,
                "parse_mode": "Markdown",
                "disable_web_page_preview": True,
            }
        ).encode("utf-8")
        req = urllib.request.Request(url, data=body, method="POST")
        req.add_header("Content-Type", "application/json")
        with urllib.request.urlopen(req, timeout=8) as resp:
            if resp.status >= 300:
                logger.warning(
                    "fallback-alert: telegram returned HTTP %d", resp.status
                )
                return False
            return True
    except urllib.error.HTTPError as exc:
        # Reading the error body is itself I/O and can fail. An exception
        # raised inside this handler is not covered by the sibling
        # ``except Exception`` clause, so it must be contained here to keep
        # the "never raises" promise.
        try:
            detail = exc.read()[:200].decode("utf-8", "replace")
        except Exception:
            detail = "<unreadable response body>"
        logger.warning(
            "fallback-alert: telegram HTTPError %d: %s",
            exc.code,
            detail,
        )
    except Exception as exc:
        # Some error messages may quote the request URL, which contains the
        # bot token; keep the secret out of the logs.
        reason = str(exc)
        if token:
            reason = reason.replace(token, "<redacted>")
        logger.warning("fallback-alert: telegram send failed: %s", reason)
    return False
|
|
||
|
|
||
def _format_message(
    *,
    session_id: str,
    platform: str,
    primary: Tuple[str, str],
    current: Tuple[str, str],
    finish_reason: str = "",
) -> str:
    """Build the Markdown alert text for an activated fallback.

    Args:
        session_id: Session identifier; shortened to 24 characters plus an
            ellipsis when longer, shown as ``<no session>`` when empty.
        platform: Optional platform name; the line is omitted when empty.
        primary: ``(provider, model)`` recorded as the session's primary.
        current: ``(provider, model)`` of the call that triggered the alert.
        finish_reason: Optional finish reason; the line is omitted when empty.

    Returns:
        Newline-joined message using Telegram legacy-Markdown bold labels and
        inline-code values.
    """

    def _code(value: str) -> str:
        # Legacy Markdown has no escape for a backtick inside an inline-code
        # entity; a stray one unbalances the entities and Telegram rejects
        # the whole message. Swap it for an apostrophe so the alert still
        # goes out.
        return "`" + str(value).replace("`", "'") + "`"

    p_provider, p_model = primary
    c_provider, c_model = current
    session_short = (session_id[:24] + "…") if len(session_id) > 24 else session_id
    lines = [
        "*Hermes fallback activated*",
        f"*session:* {_code(session_short or '<no session>')}",
    ]
    if platform:
        lines.append(f"*platform:* {_code(platform)}")
    lines.append(f"*primary:* {_code(f'{p_provider}/{p_model}')}")
    lines.append(f"*now:* {_code(f'{c_provider}/{c_model}')}")
    if finish_reason:
        lines.append(f"*finish_reason:* {_code(finish_reason)}")
    return "\n".join(lines)
|
|
||
|
|
||
# Upper bound on tracked sessions. Once reached, the oldest-recorded sessions
# are forgotten first so a long-lived process cannot grow the maps forever.
_MAX_TRACKED_SESSIONS = 10_000


def _evict_oldest_sessions_locked() -> None:
    """Make room for one new session; caller must hold ``_STATE_LOCK``."""
    while len(_PRIMARY_BY_SESSION) >= _MAX_TRACKED_SESSIONS:
        # Dicts iterate in insertion order, so the first key is the oldest.
        oldest = next(iter(_PRIMARY_BY_SESSION))
        del _PRIMARY_BY_SESSION[oldest]
        _LAST_ALERT_BY_SESSION.pop(oldest, None)


def on_post_api_request(**kwargs) -> None:
    """Hook handler. Fires after each API call regardless of outcome.

    Reads ``session_id``, ``provider``, ``model``, ``platform`` and
    ``finish_reason`` from *kwargs*. The first (provider, model) seen for a
    session is recorded as its primary; a later call with a different pair
    sends one Telegram alert, throttled per session.

    The handler never raises — any error is logged and swallowed so the
    plugin can never crash Hermes' main request loop.
    """
    try:
        creds = _credentials()
        if creds is None:
            if _debug_enabled():
                logger.info("fallback-alert: no credentials configured, skipping")
            return

        # ``str()`` keeps non-string values (e.g. a numeric id) from failing
        # on ``.strip()``.
        session_id = str(kwargs.get("session_id") or "").strip()
        provider = str(kwargs.get("provider") or "").strip()
        model = str(kwargs.get("model") or "").strip()
        if not session_id:
            # Without an id, unrelated calls would all share the "" key and
            # be compared against each other, producing false alerts.
            if _debug_enabled():
                logger.info("fallback-alert: missing session_id, skipping")
            return
        if not provider or not model:
            return

        current = (provider, model)

        with _STATE_LOCK:
            stored = _PRIMARY_BY_SESSION.get(session_id)
            if stored is None:
                _evict_oldest_sessions_locked()
                _PRIMARY_BY_SESSION[session_id] = current
                if _debug_enabled():
                    logger.info(
                        "fallback-alert: recorded primary %s for session %r",
                        current,
                        session_id,
                    )
                return
            if stored == current:
                return  # still on primary — silent
            primary = stored

            # Throttle check-and-set stays under the lock so two concurrent
            # calls for one session cannot both pass it.
            now = time.time()
            last = _LAST_ALERT_BY_SESSION.get(session_id, 0.0)
            if (now - last) < _throttle_seconds():
                if _debug_enabled():
                    logger.info(
                        "fallback-alert: throttled (%.0fs since last alert for session %r)",
                        now - last,
                        session_id,
                    )
                return
            # Stamped before sending: a failed send is also throttled.
            _LAST_ALERT_BY_SESSION[session_id] = now

        # Network I/O happens outside the lock so a slow send cannot block
        # other sessions.
        token, chat_id = creds
        text = _format_message(
            session_id=session_id,
            platform=str(kwargs.get("platform") or ""),
            primary=primary,
            current=current,
            finish_reason=str(kwargs.get("finish_reason") or ""),
        )
        _send_telegram(token, chat_id, text)
    except Exception as exc:
        logger.warning("fallback-alert: hook handler failed: %s", exc)
|
|
||
|
|
||
def _reset_state_for_tests() -> None:
    """Test helper — empties both per-session maps while holding the lock."""
    with _STATE_LOCK:
        for table in (_PRIMARY_BY_SESSION, _LAST_ALERT_BY_SESSION):
            table.clear()
|
|
||
|
|
||
def register(ctx) -> None:
    """Plugin entrypoint, called by the Hermes plugin manager on activation.

    Subscribes ``on_post_api_request`` to the ``post_api_request`` hook via
    ``ctx.register_hook``.
    """
    hook_name = "post_api_request"
    ctx.register_hook(hook_name, on_post_api_request)
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,9 @@ | ||||||
| name: fallback-alert | ||||||
| version: "1.0.0" | ||||||
| description: "Optional plugin — sends a Telegram notification when Hermes activates a provider fallback. Detects mid-session that the provider/model in the post_api_request hook differs from the first provider/model observed for that session. No-op when its required env vars are missing." | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Manifest description does not match runtime detection logic. Line 3 says fallback is detected against the configured primary (`model.provider` + `model.default` in config.yaml), but the plugin actually compares against the first (provider, model) observed in the session.
Suggested manifest wording update:
-description: "Optional plugin — sends a Telegram notification when Hermes activates a provider fallback. Detects mid-session that the provider/model in the post_api_request hook differs from the configured primary (model.provider + model.default in config.yaml). No-op when its required env vars are missing."
+description: "Optional plugin — sends a Telegram notification when Hermes activates a provider fallback. Detects mid-session that the provider/model in post_api_request differs from the first observed provider/model for that session. No-op when required env vars are missing."
📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||
| author: hermes-community | ||||||
| requires_env: | ||||||
| - FALLBACK_ALERT_TELEGRAM_BOT_TOKEN | ||||||
| - FALLBACK_ALERT_TELEGRAM_CHAT_ID | ||||||
| hooks: | ||||||
| - post_api_request | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,173 @@ | ||
| """Tests for the bundled observability/fallback-alert plugin.""" | ||
| from __future__ import annotations | ||
|
|
||
| import importlib | ||
| import importlib.util | ||
| import sys | ||
| import types | ||
| from pathlib import Path | ||
| from unittest.mock import patch | ||
|
|
||
| import pytest | ||
| import yaml | ||
|
|
||
|
|
||
# ``parents[2]`` is the directory three levels above this test file; it is
# treated as the repository root (assumes the test lives two folders below
# the root — TODO confirm against the repo layout).
REPO_ROOT = Path(__file__).resolve().parents[2]
# Directory holding the plugin under test (``plugin.yaml`` and ``__init__.py``).
PLUGIN_DIR = REPO_ROOT / "plugins" / "observability" / "fallback-alert"
|
|
||
|
|
||
def _load_plugin():
    """Import the plugin's ``__init__.py`` straight from its file path.

    The hyphen in the directory name rules out a regular
    ``import plugins.observability.fallback-alert``, so the module is loaded
    under a synthetic package name. Any previously loaded copy is discarded,
    so each call returns a freshly executed module.
    """
    package_name = "hermes_plugins_under_test"
    if package_name not in sys.modules:
        # Empty stand-in parent package so the dotted module name resolves.
        package = types.ModuleType(package_name)
        package.__path__ = []
        sys.modules[package_name] = package

    qualified_name = "hermes_plugins_under_test.fallback_alert"
    sys.modules.pop(qualified_name, None)

    entry_file = PLUGIN_DIR / "__init__.py"
    spec = importlib.util.spec_from_file_location(qualified_name, entry_file)
    module = importlib.util.module_from_spec(spec)
    # Register before executing, as the import system itself would.
    sys.modules[qualified_name] = module
    spec.loader.exec_module(module)
    return module
|
|
||
|
|
||
@pytest.fixture
def plugin(monkeypatch):
    """Yield-free fixture: fresh plugin module, empty state, clean env.

    Every ``FALLBACK_ALERT_*`` variable the plugin reads is removed first so
    each test starts from the unconfigured (no-op) state.
    """
    plugin_env_vars = (
        "FALLBACK_ALERT_TELEGRAM_BOT_TOKEN",
        "FALLBACK_ALERT_TELEGRAM_CHAT_ID",
        "FALLBACK_ALERT_THROTTLE_SECONDS",
        "FALLBACK_ALERT_DEBUG",
    )
    for env_name in plugin_env_vars:
        monkeypatch.delenv(env_name, raising=False)

    module = _load_plugin()
    module._reset_state_for_tests()
    return module
|
|
||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Manifest | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
class TestManifest:
    """Checks the plugin's on-disk layout and its ``plugin.yaml`` contents."""

    def test_directory_layout(self):
        """The plugin directory exists and holds both required files."""
        assert PLUGIN_DIR.is_dir()
        assert (PLUGIN_DIR / "plugin.yaml").exists()
        assert (PLUGIN_DIR / "__init__.py").exists()

    def test_manifest_fields(self):
        """Name, version, hooks and required env vars are declared."""
        # The manifest contains non-ASCII text (an em dash), so decode it
        # explicitly instead of relying on the platform's default encoding.
        manifest_text = (PLUGIN_DIR / "plugin.yaml").read_text(encoding="utf-8")
        data = yaml.safe_load(manifest_text)
        assert data["name"] == "fallback-alert"
        assert data["version"]
        assert data["hooks"] == ["post_api_request"]
        assert set(data["requires_env"]) == {
            "FALLBACK_ALERT_TELEGRAM_BOT_TOKEN",
            "FALLBACK_ALERT_TELEGRAM_CHAT_ID",
        }
|
|
||
|
|
||
| # --------------------------------------------------------------------------- | ||
| # Hook behaviour | ||
| # --------------------------------------------------------------------------- | ||
|
|
||
class TestHookBehaviour:
    """Behavioural tests for ``on_post_api_request`` and ``register``."""

    @staticmethod
    def _configure_credentials(monkeypatch):
        """Set both required Telegram env vars so the hook becomes active."""
        monkeypatch.setenv("FALLBACK_ALERT_TELEGRAM_BOT_TOKEN", "bot:tok")
        monkeypatch.setenv("FALLBACK_ALERT_TELEGRAM_CHAT_ID", "1234")

    def test_noop_when_credentials_missing(self, plugin):
        """No Telegram env vars => hook is silent, no _send_telegram call."""
        with patch.object(plugin, "_send_telegram") as sender:
            plugin.on_post_api_request(
                session_id="s1", provider="anthropic", model="claude-haiku-4-5-20251001"
            )
        sender.assert_not_called()

    def test_first_call_records_primary_no_alert(self, plugin, monkeypatch):
        """The first call of a session only records the primary pair."""
        self._configure_credentials(monkeypatch)
        with patch.object(plugin, "_send_telegram") as sender:
            plugin.on_post_api_request(
                session_id="s1", provider="anthropic", model="claude-haiku-4-5-20251001"
            )
        sender.assert_not_called()
        assert plugin._PRIMARY_BY_SESSION["s1"] == (
            "anthropic", "claude-haiku-4-5-20251001",
        )

    def test_same_provider_model_no_alert(self, plugin, monkeypatch):
        """Repeated calls on the primary pair never alert."""
        self._configure_credentials(monkeypatch)
        with patch.object(plugin, "_send_telegram") as sender:
            for _ in range(3):
                plugin.on_post_api_request(
                    session_id="s1", provider="anthropic", model="claude-haiku-4-5-20251001"
                )
        sender.assert_not_called()

    def test_different_provider_triggers_alert(self, plugin, monkeypatch):
        """A changed (provider, model) sends exactly one detailed alert."""
        self._configure_credentials(monkeypatch)
        with patch.object(plugin, "_send_telegram", return_value=True) as sender:
            # First call sets primary.
            plugin.on_post_api_request(
                session_id="s1", provider="anthropic", model="claude-haiku-4-5-20251001"
            )
            # Second call: different provider — fallback active.
            plugin.on_post_api_request(
                session_id="s1", provider="openrouter",
                model="anthropic/claude-haiku-4-5",
                platform="telegram", finish_reason="tool_use",
            )
        assert sender.call_count == 1
        args, _kwargs = sender.call_args
        token, chat, text = args
        assert token == "bot:tok"
        assert chat == "1234"
        assert "anthropic/claude-haiku-4-5-20251001" in text
        assert "openrouter/anthropic/claude-haiku-4-5" in text
        assert "tool_use" in text
        assert "telegram" in text

    def test_throttle_suppresses_repeated_alerts(self, plugin, monkeypatch):
        """Within the throttle window only the first fallback call alerts."""
        self._configure_credentials(monkeypatch)
        monkeypatch.setenv("FALLBACK_ALERT_THROTTLE_SECONDS", "300")
        with patch.object(plugin, "_send_telegram", return_value=True) as sender:
            plugin.on_post_api_request(
                session_id="s1", provider="anthropic", model="claude-haiku-4-5-20251001"
            )
            for _ in range(5):
                plugin.on_post_api_request(
                    session_id="s1", provider="openrouter",
                    model="anthropic/claude-haiku-4-5",
                )
        assert sender.call_count == 1

    def test_hook_swallows_exceptions(self, plugin, monkeypatch):
        """Any unexpected error must NOT propagate (would crash Hermes loop)."""
        self._configure_credentials(monkeypatch)
        with patch.object(
            plugin, "_send_telegram", side_effect=RuntimeError("boom")
        ) as sender:
            plugin.on_post_api_request(
                session_id="s1", provider="anthropic", model="claude-haiku-4-5-20251001"
            )
            # Trigger fallback — _send_telegram raises but hook must not.
            plugin.on_post_api_request(
                session_id="s1", provider="openrouter",
                model="anthropic/claude-haiku-4-5",
            )
        # Prove the failing sender was actually reached; otherwise the test
        # would pass even if the alert path were never taken.
        sender.assert_called_once()

    def test_register_wires_post_api_request(self, plugin):
        """``register`` subscribes the handler to ``post_api_request`` only."""
        # ``object`` rather than the builtin ``callable``, which is a
        # function and not a type.
        seen: list[tuple[str, object]] = []

        class Ctx:
            def register_hook(self, name, cb):
                seen.append((name, cb))

        plugin.register(Ctx())
        assert seen == [("post_api_request", plugin.on_post_api_request)]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Per-session state has no eviction path and can grow unbounded.
`_PRIMARY_BY_SESSION` and `_LAST_ALERT_BY_SESSION` only grow. In long-lived processes with many sessions, this creates a memory growth risk. Add lifecycle cleanup (e.g., on terminal `finish_reason`) and/or TTL-based pruning. Also applies to: 150-176, 190-194
🤖 Prompt for AI Agents