Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 28b8f2b

Browse files
authored
Read max context from litellm + compact at 90% (#54)
* Read max context from litellm.get_model_info + compact at 90% The local _MAX_TOKENS_MAP had Claude Opus 4.6 pinned at 200k, triggering compaction at ~20% of its real 1M window. Swap the hand-maintained table for litellm.get_model_info()['max_input_tokens'], which LiteLLM keeps in sync upstream (Opus 4.6=1M, GPT-5=272k, Sonnet 4.5=200k, etc.). HF router-only ids (MiniMax, Kimi, GLM) aren't in litellm's catalog and fall through to the 200k default — close enough to their advertised ranges and safe if the model lies. Also shifts compaction to fire at 90% of max_context instead of >100%. The old condition waited until context had already overshot, risking the next LLM call hitting ContextWindowExceededError before the compact finished. 90% gives headroom for the summary call + one more turn. * Drop the legacy -10k buffer — the 90% ratio is the headroom now ContextManager was subtracting a fixed 10k tokens from max_context on top of the new 90% compaction threshold, so a 1M-window Opus was triggering at 891k instead of the intended 900k. Keep max_context == the real model ceiling; _COMPACT_THRESHOLD_RATIO is the single source of headroom. * Rename max_context -> model_max_tokens 'max_context' read ambiguously — some reviewers assumed it meant the compaction threshold. Rename to 'model_max_tokens' so it's unmistakably the model's real input-token ceiling (what litellm.get_model_info reports), distinct from the internally-computed compaction threshold. Touches the ContextManager attribute + ctor param, Session's constructor kwarg and update_model setter, and the agent loop's debug/warning logs. No behavior change. * Simplify compaction gate + rename context_length -> running_context_usage - ContextManager gains compaction_threshold and needs_compaction properties so callers stop recomputing "90% of model_max_tokens" by hand. compact() becomes `if not self.needs_compaction: return`. - Rename self.context_length to self.running_context_usage. 
The old name read like a second ceiling value; the new name says what it is — the last-reported total_tokens from usage. add_message(), the ContextWindowExceededError handler, and the compact-finish recompute all updated accordingly. - Collapsed _compact_and_notify's local-alias dance (old_length / max_ctx / threshold) into a single `cm = session.context_manager`.
1 parent 5ab7c4e commit 28b8f2b

3 files changed

Lines changed: 69 additions & 72 deletions

File tree

‎agent/context_manager/manager.py‎

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ class ContextManager:
7373

7474
def __init__(
7575
self,
76-
max_context: int = 180_000,
76+
model_max_tokens: int = 180_000,
7777
compact_size: float = 0.1,
7878
untouched_messages: int = 5,
7979
tool_specs: list[dict[str, Any]] | None = None,
@@ -87,9 +87,15 @@ def __init__(
8787
hf_token=hf_token,
8888
local_mode=local_mode,
8989
)
90-
self.max_context = max_context - 10000
91-
self.compact_size = int(max_context * compact_size)
92-
self.context_length = 0 # Updated after each LLM call with actual usage
90+
# The model's real input-token ceiling (from litellm.get_model_info).
91+
# Compaction triggers at _COMPACT_THRESHOLD_RATIO below it — see
92+
# the compaction_threshold property.
93+
self.model_max_tokens = model_max_tokens
94+
self.compact_size = int(model_max_tokens * compact_size)
95+
# Running count of tokens the last LLM call reported. Drives the
96+
# compaction gate; updated in add_message() with each response's
97+
# usage.total_tokens.
98+
self.running_context_usage = 0
9399
self.untouched_messages = untouched_messages
94100
self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
95101

@@ -149,7 +155,7 @@ def _load_system_prompt(
149155
def add_message(self, message: Message, token_count: int = None) -> None:
150156
"""Add a message to the history"""
151157
if token_count:
152-
self.context_length = token_count
158+
self.running_context_usage = token_count
153159
self.items.append(message)
154160

155161
def get_messages(self) -> list[Message]:
@@ -262,14 +268,27 @@ def truncate_to_user_message(self, user_message_index: int) -> bool:
262268
count += 1
263269
return False
264270

271+
# Compaction fires at 90% of model_max_tokens so there's headroom for
272+
# the next turn's prompt + response before we actually hit the ceiling.
273+
_COMPACT_THRESHOLD_RATIO = 0.9
274+
275+
@property
276+
def compaction_threshold(self) -> int:
277+
"""Token count at which `compact()` kicks in."""
278+
return int(self.model_max_tokens * self._COMPACT_THRESHOLD_RATIO)
279+
280+
@property
281+
def needs_compaction(self) -> bool:
282+
return self.running_context_usage > self.compaction_threshold and bool(self.items)
283+
265284
async def compact(
266285
self,
267286
model_name: str,
268287
tool_specs: list[dict] | None = None,
269288
hf_token: str | None = None,
270289
) -> None:
271290
"""Remove old messages to keep history under target size"""
272-
if (self.context_length <= self.max_context) or not self.items:
291+
if not self.needs_compaction:
273292
return
274293

275294
system_msg = (
@@ -325,6 +344,6 @@ async def compact(
325344
head.append(first_user_msg)
326345
self.items = head + [summarized_message] + recent_messages
327346

328-
self.context_length = (
347+
self.running_context_usage = (
329348
len(self.system_prompt) // 4 + response.usage.completion_tokens
330349
)

‎agent/core/agent_loop.py‎

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -180,29 +180,27 @@ def _friendly_error_message(error: Exception) -> str | None:
180180

181181
async def _compact_and_notify(session: Session) -> None:
182182
"""Run compaction and send event if context was reduced."""
183-
old_length = session.context_manager.context_length
184-
max_ctx = session.context_manager.max_context
183+
cm = session.context_manager
184+
old_usage = cm.running_context_usage
185185
logger.debug(
186-
"Compaction check: context_length=%d, max_context=%d, needs_compact=%s",
187-
old_length, max_ctx, old_length > max_ctx,
186+
"Compaction check: usage=%d, max=%d, threshold=%d, needs_compact=%s",
187+
old_usage, cm.model_max_tokens, cm.compaction_threshold, cm.needs_compaction,
188188
)
189-
tool_specs = session.tool_router.get_tool_specs_for_llm()
190-
await session.context_manager.compact(
189+
await cm.compact(
191190
model_name=session.config.model_name,
192-
tool_specs=tool_specs,
191+
tool_specs=session.tool_router.get_tool_specs_for_llm(),
193192
hf_token=session.hf_token,
194193
)
195-
new_length = session.context_manager.context_length
196-
if new_length != old_length:
194+
new_usage = cm.running_context_usage
195+
if new_usage != old_usage:
197196
logger.warning(
198197
"Context compacted: %d -> %d tokens (max=%d, %d messages)",
199-
old_length, new_length, max_ctx,
200-
len(session.context_manager.items),
198+
old_usage, new_usage, cm.model_max_tokens, len(cm.items),
201199
)
202200
await session.send_event(
203201
Event(
204202
event_type="compacted",
205-
data={"old_tokens": old_length, "new_tokens": new_length},
203+
data={"old_tokens": old_usage, "new_tokens": new_usage},
206204
)
207205
)
208206

@@ -576,13 +574,13 @@ async def run_agent(
576574
logger.debug(
577575
"Agent loop ending: no tool calls. "
578576
"finish_reason=%s, token_count=%d, "
579-
"context_length=%d, max_context=%d, "
577+
"usage=%d, model_max_tokens=%d, "
580578
"iteration=%d/%d, "
581579
"response_text=%s",
582580
finish_reason,
583581
token_count,
584-
session.context_manager.context_length,
585-
session.context_manager.max_context,
582+
session.context_manager.running_context_usage,
583+
session.context_manager.model_max_tokens,
586584
iteration,
587585
max_iterations,
588586
(content or "")[:500],
@@ -785,17 +783,13 @@ async def _exec_tool(
785783

786784
except ContextWindowExceededError:
787785
# Force compact and retry this iteration
786+
cm = session.context_manager
788787
logger.warning(
789788
"ContextWindowExceededError at iteration %d — forcing compaction "
790-
"(context_length=%d, max_context=%d, messages=%d)",
791-
iteration,
792-
session.context_manager.context_length,
793-
session.context_manager.max_context,
794-
len(session.context_manager.items),
795-
)
796-
session.context_manager.context_length = (
797-
session.context_manager.max_context + 1
789+
"(usage=%d, model_max_tokens=%d, messages=%d)",
790+
iteration, cm.running_context_usage, cm.model_max_tokens, len(cm.items),
798791
)
792+
cm.running_context_usage = cm.model_max_tokens + 1
799793
await _compact_and_notify(session)
800794
continue
801795

‎agent/core/session.py‎

Lines changed: 26 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -15,53 +15,37 @@
1515

1616
logger = logging.getLogger(__name__)
1717

18-
# Local max-token lookup — avoids litellm.get_max_tokens() which can hang
19-
# on network calls for certain providers (known litellm issue).
20-
_MAX_TOKENS_MAP: dict[str, int] = {
21-
"anthropic/claude-opus-4-6": 200_000,
22-
"anthropic/claude-opus-4-5-20251101": 200_000,
23-
"anthropic/claude-sonnet-4-5-20250929": 200_000,
24-
"anthropic/claude-sonnet-4-20250514": 200_000,
25-
"anthropic/claude-haiku-3-5-20241022": 200_000,
26-
"anthropic/claude-3-5-sonnet-20241022": 200_000,
27-
"anthropic/claude-3-opus-20240229": 200_000,
28-
}
2918
_DEFAULT_MAX_TOKENS = 200_000
3019

3120

3221
def _get_max_tokens_safe(model_name: str) -> int:
33-
"""Return the max context window for a model.
34-
35-
Anthropic/OpenAI ids hit the local table; HF router ids ask the catalog
36-
(cached) for the max ``context_length`` across live providers. Falls back
37-
to ``_DEFAULT_MAX_TOKENS`` if nothing is available.
22+
"""Return the max input-context tokens for a model.
23+
24+
Primary source: ``litellm.get_model_info(model)['max_input_tokens']`` —
25+
LiteLLM maintains an upstream catalog that knows Claude Opus 4.6 is
26+
1M, GPT-5 is 272k, Sonnet 4.5 is 200k, and so on. Strips any HF routing
27+
suffix / huggingface/ prefix so tagged ids ('moonshotai/Kimi-K2.6:cheapest')
28+
look up the bare model. Falls back to a conservative 200k default for
29+
models not in the catalog (typically HF-router-only models).
3830
"""
39-
tokens = _MAX_TOKENS_MAP.get(model_name)
40-
if tokens:
41-
return tokens
31+
from litellm import get_model_info
4232

43-
if not model_name.startswith(("anthropic/", "openai/")):
33+
candidates = [model_name]
34+
stripped = model_name.removeprefix("huggingface/").split(":", 1)[0]
35+
if stripped != model_name:
36+
candidates.append(stripped)
37+
for candidate in candidates:
4438
try:
45-
from agent.core import hf_router_catalog as cat
46-
47-
bare = model_name.removeprefix("huggingface/").split(":", 1)[0]
48-
info = cat.lookup(bare)
49-
if info and info.max_context_length:
50-
return info.max_context_length
51-
except Exception as e:
52-
logger.warning("HF catalog lookup failed for %s: %s", model_name, e)
53-
54-
try:
55-
from litellm import get_max_tokens
56-
57-
result = get_max_tokens(model_name)
58-
if result and isinstance(result, int):
59-
return result
60-
logger.warning(
61-
f"get_max_tokens returned {result} for {model_name}, using default"
62-
)
63-
except Exception as e:
64-
logger.warning(f"get_max_tokens failed for {model_name}, using default: {e}")
39+
info = get_model_info(candidate)
40+
max_input = info.get("max_input_tokens") if info else None
41+
if isinstance(max_input, int) and max_input > 0:
42+
return max_input
43+
except Exception:
44+
continue
45+
logger.info(
46+
"No litellm.get_model_info entry for %s, falling back to %d",
47+
model_name, _DEFAULT_MAX_TOKENS,
48+
)
6549
return _DEFAULT_MAX_TOKENS
6650

6751

@@ -101,7 +85,7 @@ def __init__(
10185
self.stream = stream
10286
tool_specs = tool_router.get_tool_specs_for_llm() if tool_router else []
10387
self.context_manager = context_manager or ContextManager(
104-
max_context=_get_max_tokens_safe(config.model_name),
88+
model_max_tokens=_get_max_tokens_safe(config.model_name),
10589
compact_size=0.1,
10690
untouched_messages=5,
10791
tool_specs=tool_specs,
@@ -153,7 +137,7 @@ def is_cancelled(self) -> bool:
153137
def update_model(self, model_name: str) -> None:
154138
"""Switch the active model and update the context window limit."""
155139
self.config.model_name = model_name
156-
self.context_manager.max_context = _get_max_tokens_safe(model_name)
140+
self.context_manager.model_max_tokens = _get_max_tokens_safe(model_name)
157141

158142
def increment_turn(self) -> None:
159143
"""Increment turn counter (called after each user interaction)"""

0 commit comments

Comments
 (0)