Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
199 changes: 199 additions & 0 deletions plugins/observability/fallback-alert/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
"""fallback-alert — Telegram notification when Hermes activates provider fallback.

Detects, by comparing (provider, model) seen in successive ``post_api_request``
hook calls within the same session, that Hermes has swapped to a different
provider — which is the signature of an activated fallback in
``run_agent.py::_try_activate_fallback`` after a primary failure (429 / 5xx /
auth-error per ``agent/error_classifier.py``).

The plugin records the (provider, model) of the first API call of a session
as that session's primary; any later call with a different (provider, model)
triggers a Telegram message. Throttled per session.

No imports from Hermes internals. No third-party deps. Activates only when
both Telegram env vars are set; otherwise the hook is a no-op.

Required env vars
-----------------
FALLBACK_ALERT_TELEGRAM_BOT_TOKEN
Bot token, e.g. ``123456:ABC-DEF...``
FALLBACK_ALERT_TELEGRAM_CHAT_ID
Numeric user/group chat id, or ``@channelusername``

Optional env vars
-----------------
FALLBACK_ALERT_THROTTLE_SECONDS
Min seconds between alerts per session. Default 300.
FALLBACK_ALERT_DEBUG
``true`` to log no-op reasons at INFO level.
"""
from __future__ import annotations

import json
import logging
import os
import threading
import time
import urllib.error
import urllib.request
from typing import Dict, Optional, Tuple

logger = logging.getLogger(__name__)

# Module-level state: each entry is per-session.
# Nothing in this module removes entries except _reset_state_for_tests().
# session_id -> (provider, model) recorded on that session's first hook call.
_PRIMARY_BY_SESSION: Dict[str, Tuple[str, str]] = {}
# session_id -> time.time() of the most recent alert attempt (throttling).
_LAST_ALERT_BY_SESSION: Dict[str, float] = {}
# Held while reading or mutating either dict above.
_STATE_LOCK = threading.Lock()
Comment on lines +44 to +46

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | 🏗️ Heavy lift

Per-session state has no eviction path and can grow unbounded.

_PRIMARY_BY_SESSION and _LAST_ALERT_BY_SESSION only grow. In long-lived processes with many sessions, this creates a memory growth risk. Add lifecycle cleanup (e.g., on terminal finish_reason) and/or TTL-based pruning.

Also applies to: 150-176, 190-194

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@plugins/observability/fallback-alert/__init__.py` around lines 44 - 46,
_PRIMARY_BY_SESSION and _LAST_ALERT_BY_SESSION accumulate indefinitely; add
eviction by removing entries when a session reaches a terminal finish_reason and
add TTL-based pruning for stale sessions. Modify the code paths that handle
session completion (where finish_reason is observed) to acquire _STATE_LOCK and
pop the session key from _PRIMARY_BY_SESSION and _LAST_ALERT_BY_SESSION;
additionally add a periodic cleanup (background thread or scheduled task) that
scans keys under _STATE_LOCK and removes entries older than a configurable TTL
(use timestamps stored in _LAST_ALERT_BY_SESSION or a new _SESSION_LAST_ACTIVE
map). Ensure all mutations use _STATE_LOCK for thread safety and expose a
configurable TTL constant and cleanup interval so memory growth is bounded.



def _env(name: str, default: str = "") -> str:
    """Return environment variable *name* with surrounding whitespace removed.

    When the variable is unset, *default* is used instead (and is stripped
    the same way).
    """
    raw_value = os.environ.get(name, default)
    return raw_value.strip()


def _debug_enabled() -> bool:
    """Report whether FALLBACK_ALERT_DEBUG is set to a truthy flag.

    Accepted values (case-insensitive): ``1``, ``true``, ``yes``, ``on``.
    """
    flag = _env("FALLBACK_ALERT_DEBUG").lower()
    return flag in {"1", "true", "yes", "on"}


def _throttle_seconds() -> int:
    """Return the per-session alert throttle window in whole seconds.

    Reads FALLBACK_ALERT_THROTTLE_SECONDS (default ``"300"``). Values below
    1 are raised to 1; a value that does not parse as an integer yields 300.
    """
    raw = _env("FALLBACK_ALERT_THROTTLE_SECONDS", "300")
    try:
        seconds = int(raw)
    except ValueError:
        return 300
    return max(1, seconds)


def _credentials() -> Optional[Tuple[str, str]]:
    """Return ``(bot_token, chat_id)`` from the environment.

    Returns None unless both FALLBACK_ALERT_TELEGRAM_BOT_TOKEN and
    FALLBACK_ALERT_TELEGRAM_CHAT_ID are set to non-blank values.
    """
    bot_token = _env("FALLBACK_ALERT_TELEGRAM_BOT_TOKEN")
    target_chat = _env("FALLBACK_ALERT_TELEGRAM_CHAT_ID")
    if bot_token and target_chat:
        return bot_token, target_chat
    return None


def _send_telegram(token: str, chat_id: str, text: str) -> bool:
    """Send *text* to a Telegram chat through the Bot API ``sendMessage`` call.

    Never raises: every failure is logged at WARNING level and reported via
    the return value.

    Returns:
        True when Telegram answers with an HTTP status below 300, else False.
    """
    try:
        payload = {
            "chat_id": chat_id,
            "text": text,
            "parse_mode": "Markdown",
            "disable_web_page_preview": True,
        }
        endpoint = f"https://api.telegram.org/bot{token}/sendMessage"
        request = urllib.request.Request(
            endpoint,
            data=json.dumps(payload).encode("utf-8"),
            method="POST",
        )
        request.add_header("Content-Type", "application/json")
        with urllib.request.urlopen(request, timeout=8) as response:
            if response.status < 300:
                return True
            logger.warning(
                "fallback-alert: telegram returned HTTP %d", response.status
            )
            return False
    except urllib.error.HTTPError as http_err:
        # Only the first 200 bytes of the error body are logged.
        logger.warning(
            "fallback-alert: telegram HTTPError %d: %s",
            http_err.code,
            http_err.read()[:200].decode("utf-8", "replace"),
        )
    except Exception as err:
        logger.warning("fallback-alert: telegram send failed: %s", err)
    return False


def _format_message(
    *,
    session_id: str,
    platform: str,
    primary: Tuple[str, str],
    current: Tuple[str, str],
    finish_reason: str = "",
) -> str:
    """Build the alert text, formatted for Telegram's legacy Markdown mode.

    Args:
        session_id: Session identifier; shortened to 24 characters plus an
            ellipsis when longer, shown as ``<no session>`` when empty.
        platform: Platform label; its line is omitted when empty.
        primary: ``(provider, model)`` recorded as the session's primary.
        current: ``(provider, model)`` of the call that triggered the alert.
        finish_reason: Optional finish reason; its line is omitted when empty.

    Returns:
        The message lines joined with newlines.
    """

    def _code(value: object) -> str:
        # Each value sits inside an inline-code span. A literal backtick
        # would close that span early and can make Telegram reject the
        # whole message, so it is swapped for an apostrophe.
        return "`" + str(value).replace("`", "'") + "`"

    p_provider, p_model = primary
    c_provider, c_model = current
    session_short = (session_id[:24] + "…") if len(session_id) > 24 else session_id
    lines = [
        "*Hermes fallback activated*",
        f"*session:* {_code(session_short or '<no session>')}",
    ]
    if platform:
        lines.append(f"*platform:* {_code(platform)}")
    lines.append(f"*primary:* {_code(f'{p_provider}/{p_model}')}")
    lines.append(f"*now:* {_code(f'{c_provider}/{c_model}')}")
    if finish_reason:
        lines.append(f"*finish_reason:* {_code(finish_reason)}")
    return "\n".join(lines)


def on_post_api_request(**kwargs) -> None:
    """Hook handler. Fires after each API call regardless of outcome.

    Records the first (provider, model) seen for a session as its primary
    and sends a Telegram alert when a later call in the same session reports
    a different pair, at most once per throttle window per session.

    Reads ``session_id``, ``provider``, ``model``, ``platform`` and
    ``finish_reason`` from *kwargs*. The call is a no-op when credentials
    are not configured or when ``session_id``, ``provider`` or ``model`` is
    missing or blank.

    The handler never raises — any error is logged and swallowed so the
    plugin can never crash Hermes' main request loop.
    """
    try:
        creds = _credentials()
        if creds is None:
            if _debug_enabled():
                logger.info("fallback-alert: no credentials configured, skipping")
            return

        session_id = (kwargs.get("session_id") or "").strip()
        provider = (kwargs.get("provider") or "").strip()
        model = (kwargs.get("model") or "").strip()
        if not session_id:
            # Without a session id every anonymous call would share the ""
            # key, so unrelated requests could trigger false alerts and
            # throttle one another.
            if _debug_enabled():
                logger.info("fallback-alert: missing session_id, skipping")
            return
        if not provider or not model:
            return

        current = (provider, model)

        with _STATE_LOCK:
            primary = _PRIMARY_BY_SESSION.get(session_id)
            if primary is None:
                _PRIMARY_BY_SESSION[session_id] = current
                if _debug_enabled():
                    logger.info(
                        "fallback-alert: recorded primary %s for session %r",
                        current,
                        session_id,
                    )
                return
            if primary == current:
                return  # still on primary — silent

            now = time.time()
            last = _LAST_ALERT_BY_SESSION.get(session_id, 0.0)
            if (now - last) < _throttle_seconds():
                if _debug_enabled():
                    logger.info(
                        "fallback-alert: throttled (%.0fs since last alert for session %r)",
                        now - last,
                        session_id,
                    )
                return
            # Stamp before sending so concurrent calls for this session are
            # throttled while the request below is in flight.
            _LAST_ALERT_BY_SESSION[session_id] = now

        # The network call happens outside the lock.
        token, chat_id = creds
        text = _format_message(
            session_id=session_id,
            platform=str(kwargs.get("platform") or ""),
            primary=primary,
            current=current,
            finish_reason=str(kwargs.get("finish_reason") or ""),
        )
        _send_telegram(token, chat_id, text)
    except Exception as exc:
        logger.warning("fallback-alert: hook handler failed: %s", exc)


def _reset_state_for_tests() -> None:
    """Test helper: empty both per-session dicts while holding the state lock."""
    with _STATE_LOCK:
        for table in (_PRIMARY_BY_SESSION, _LAST_ALERT_BY_SESSION):
            table.clear()


def register(ctx) -> None:
    """Plugin entrypoint, called by the Hermes plugin manager on activation.

    Registers ``on_post_api_request`` for the ``post_api_request`` hook.
    """
    hook_name = "post_api_request"
    ctx.register_hook(hook_name, on_post_api_request)
9 changes: 9 additions & 0 deletions plugins/observability/fallback-alert/plugin.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
name: fallback-alert
version: "1.0.0"
description: "Optional plugin — sends a Telegram notification when Hermes activates a provider fallback. Detects mid-session that the provider/model in the post_api_request hook differs from the first provider/model observed for that session. No-op when its required env vars are missing."

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Manifest description does not match runtime detection logic.

Line 3 says fallback is detected against configured primary (config.yaml), but the plugin implementation detects fallback against the first observed (provider, model) in a session. This mismatch can cause operator confusion.

Suggested manifest wording update
-description: "Optional plugin — sends a Telegram notification when Hermes activates a provider fallback. Detects mid-session that the provider/model in the post_api_request hook differs from the configured primary (model.provider + model.default in config.yaml). No-op when its required env vars are missing."
+description: "Optional plugin — sends a Telegram notification when Hermes activates a provider fallback. Detects mid-session that the provider/model in post_api_request differs from the first observed provider/model for that session. No-op when required env vars are missing."
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
description: "Optional plugin — sends a Telegram notification when Hermes activates a provider fallback. Detects mid-session that the provider/model in the post_api_request hook differs from the configured primary (model.provider + model.default in config.yaml). No-op when its required env vars are missing."
description: "Optional plugin — sends a Telegram notification when Hermes activates a provider fallback. Detects mid-session that the provider/model in post_api_request differs from the first observed provider/model for that session. No-op when required env vars are missing."
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@plugins/observability/fallback-alert/plugin.yaml` at line 3, Update the
plugin manifest description to reflect the actual runtime behavior: instead of
claiming fallback is detected against the configured primary in config.yaml,
state that fallback is detected by comparing the current request's
provider/model to the first observed (provider, model) for the session (as
implemented in the post_api_request hook) and that detection is a no-op when env
vars are missing; edit the description string in plugin.yaml accordingly so
operators aren't misled.

author: hermes-community
requires_env:
- FALLBACK_ALERT_TELEGRAM_BOT_TOKEN
- FALLBACK_ALERT_TELEGRAM_CHAT_ID
hooks:
- post_api_request
173 changes: 173 additions & 0 deletions tests/plugins/test_fallback_alert_plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
"""Tests for the bundled observability/fallback-alert plugin."""
from __future__ import annotations

import importlib
import importlib.util
import sys
import types
from pathlib import Path
from unittest.mock import patch

import pytest
import yaml


REPO_ROOT = Path(__file__).resolve().parents[2]
PLUGIN_DIR = REPO_ROOT / "plugins" / "observability" / "fallback-alert"


def _load_plugin():
    """Load the plugin's ``__init__.py`` from its file path and return the module.

    The hyphen in the directory name prevents a regular
    ``import plugins.observability.fallback-alert``, so the module is
    executed under a synthetic package name. Any copy already present in
    ``sys.modules`` is dropped first, so each call yields a fresh module.
    """
    package = "hermes_plugins_under_test"
    if package not in sys.modules:
        namespace = types.ModuleType(package)
        namespace.__path__ = []
        sys.modules[package] = namespace

    qualified = "hermes_plugins_under_test.fallback_alert"
    sys.modules.pop(qualified, None)

    spec = importlib.util.spec_from_file_location(
        qualified,
        PLUGIN_DIR / "__init__.py",
    )
    module = importlib.util.module_from_spec(spec)
    sys.modules[qualified] = module
    spec.loader.exec_module(module)
    return module


@pytest.fixture
def plugin(monkeypatch):
    """Provide a freshly loaded plugin module with clean state and env.

    All FALLBACK_ALERT_* variables the plugin reads are removed before the
    module is loaded, and its in-memory session state is cleared.
    """
    plugin_env_vars = (
        "FALLBACK_ALERT_TELEGRAM_BOT_TOKEN",
        "FALLBACK_ALERT_TELEGRAM_CHAT_ID",
        "FALLBACK_ALERT_THROTTLE_SECONDS",
        "FALLBACK_ALERT_DEBUG",
    )
    for env_name in plugin_env_vars:
        monkeypatch.delenv(env_name, raising=False)

    module = _load_plugin()
    module._reset_state_for_tests()
    return module


# ---------------------------------------------------------------------------
# Manifest
# ---------------------------------------------------------------------------

class TestManifest:
    """Checks on the plugin's on-disk layout and its plugin.yaml manifest."""

    def test_directory_layout(self):
        """The plugin directory holds both the manifest and the module."""
        assert PLUGIN_DIR.is_dir()
        for filename in ("plugin.yaml", "__init__.py"):
            assert (PLUGIN_DIR / filename).exists()

    def test_manifest_fields(self):
        """The manifest declares the expected name, hook and required env."""
        manifest_text = (PLUGIN_DIR / "plugin.yaml").read_text()
        data = yaml.safe_load(manifest_text)
        assert data["name"] == "fallback-alert"
        assert data["version"]
        assert data["hooks"] == ["post_api_request"]
        expected_env = {
            "FALLBACK_ALERT_TELEGRAM_BOT_TOKEN",
            "FALLBACK_ALERT_TELEGRAM_CHAT_ID",
        }
        assert set(data["requires_env"]) == expected_env


# ---------------------------------------------------------------------------
# Hook behaviour
# ---------------------------------------------------------------------------

class TestHookBehaviour:
    """Behaviour of ``on_post_api_request`` and ``register``."""

    PRIMARY = {
        "session_id": "s1",
        "provider": "anthropic",
        "model": "claude-haiku-4-5-20251001",
    }
    FALLBACK = {
        "session_id": "s1",
        "provider": "openrouter",
        "model": "anthropic/claude-haiku-4-5",
    }

    @staticmethod
    def _set_credentials(monkeypatch):
        """Configure both Telegram env vars so the hook is active."""
        monkeypatch.setenv("FALLBACK_ALERT_TELEGRAM_BOT_TOKEN", "bot:tok")
        monkeypatch.setenv("FALLBACK_ALERT_TELEGRAM_CHAT_ID", "1234")

    def test_noop_when_credentials_missing(self, plugin):
        """No Telegram env vars => hook is silent, no _send_telegram call."""
        with patch.object(plugin, "_send_telegram") as sender:
            plugin.on_post_api_request(**self.PRIMARY)
        sender.assert_not_called()

    def test_first_call_records_primary_no_alert(self, plugin, monkeypatch):
        """The first call of a session is stored as primary without alerting."""
        self._set_credentials(monkeypatch)
        with patch.object(plugin, "_send_telegram") as sender:
            plugin.on_post_api_request(**self.PRIMARY)
        sender.assert_not_called()
        assert plugin._PRIMARY_BY_SESSION["s1"] == (
            "anthropic", "claude-haiku-4-5-20251001",
        )

    def test_same_provider_model_no_alert(self, plugin, monkeypatch):
        """Repeated calls on the primary (provider, model) never alert."""
        self._set_credentials(monkeypatch)
        with patch.object(plugin, "_send_telegram") as sender:
            for _ in range(3):
                plugin.on_post_api_request(**self.PRIMARY)
        sender.assert_not_called()

    def test_different_provider_triggers_alert(self, plugin, monkeypatch):
        """A later call with another provider sends exactly one alert."""
        self._set_credentials(monkeypatch)
        with patch.object(plugin, "_send_telegram", return_value=True) as sender:
            # First call sets primary.
            plugin.on_post_api_request(**self.PRIMARY)
            # Second call: different provider — fallback active.
            plugin.on_post_api_request(
                platform="telegram", finish_reason="tool_use", **self.FALLBACK
            )
        assert sender.call_count == 1
        token, chat, text = sender.call_args[0]
        assert token == "bot:tok"
        assert chat == "1234"
        for fragment in (
            "anthropic/claude-haiku-4-5-20251001",
            "openrouter/anthropic/claude-haiku-4-5",
            "tool_use",
            "telegram",
        ):
            assert fragment in text

    def test_throttle_suppresses_repeated_alerts(self, plugin, monkeypatch):
        """Within the throttle window only the first fallback call alerts."""
        self._set_credentials(monkeypatch)
        monkeypatch.setenv("FALLBACK_ALERT_THROTTLE_SECONDS", "300")
        with patch.object(plugin, "_send_telegram", return_value=True) as sender:
            plugin.on_post_api_request(**self.PRIMARY)
            for _ in range(5):
                plugin.on_post_api_request(**self.FALLBACK)
        assert sender.call_count == 1

    def test_hook_swallows_exceptions(self, plugin, monkeypatch):
        """Any unexpected error must NOT propagate (would crash Hermes loop)."""
        self._set_credentials(monkeypatch)
        with patch.object(
            plugin, "_send_telegram", side_effect=RuntimeError("boom")
        ):
            plugin.on_post_api_request(**self.PRIMARY)
            # Trigger fallback — _send_telegram raises but hook must not.
            plugin.on_post_api_request(**self.FALLBACK)

    def test_register_wires_post_api_request(self, plugin):
        """``register`` attaches the handler to the post_api_request hook."""
        registered = []

        class Ctx:
            def register_hook(self, name, cb):
                registered.append((name, cb))

        plugin.register(Ctx())
        assert registered == [("post_api_request", plugin.on_post_api_request)]
Loading