Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 28b8f2b

Browse files
authored
Read max context from litellm + compact at 90% (#54)
* Read max context from litellm.get_model_info + compact at 90% The local _MAX_TOKENS_MAP had Claude Opus 4.6 pinned at 200k, triggering compaction at ~20% of its real 1M window. Swap the hand-maintained table for litellm.get_model_info()['max_input_tokens'], which LiteLLM keeps in sync upstream (Opus 4.6=1M, GPT-5=272k, Sonnet 4.5=200k, etc.). HF router-only ids (MiniMax, Kimi, GLM) aren't in litellm's catalog and fall through to the 200k default — close enough to their advertised ranges and safe if the model lies. Also shifts compaction to fire at 90% of max_context instead of >100%. The old condition waited until context had already overshot, risking the next LLM call hitting ContextWindowExceededError before the compact finished. 90% gives headroom for the summary call + one more turn. * Drop the legacy -10k buffer — the 90% ratio is the headroom now ContextManager was subtracting a fixed 10k tokens from max_context on top of the new 90% compaction threshold, so a 1M-window Opus was triggering at 891k instead of the intended 900k. Keep max_context == the real model ceiling; _COMPACT_THRESHOLD_RATIO is the single source of headroom. * Rename max_context -> model_max_tokens 'max_context' read ambiguously — some reviewers assumed it meant the compaction threshold. Rename to 'model_max_tokens' so it's unmistakably the model's real input-token ceiling (what litellm.get_model_info reports), distinct from the internally-computed compaction threshold. Touches the ContextManager attribute + ctor param, Session's constructor kwarg and update_model setter, and the agent loop's debug/warning logs. No behavior change. * Simplify compaction gate + rename context_length -> running_context_usage - ContextManager gains compaction_threshold and needs_compaction properties so callers stop recomputing "90% of model_max_tokens" by hand. compact() becomes `if not self.needs_compaction: return`. - Rename self.context_length to self.running_context_usage. 
The old name read like a second ceiling value; the new name says what it is — the last-reported total_tokens from usage. add_message(), the ContextWindowExceededError handler, and the compact-finish recompute all updated accordingly. - Collapsed _compact_and_notify's local-alias dance (old_length / max_ctx / threshold) into a single `cm = session.context_manager`.
1 parent 5ab7c4e commit 28b8f2b

3 files changed

Lines changed: 69 additions & 72 deletions

File tree

‎agent/context_manager/manager.py‎

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ class ContextManager:
7373

7474
def __init__(
7575
self,
76-
max_context: int = 180_000,
76+
model_max_tokens: int = 180_000,
7777
compact_size: float = 0.1,
7878
untouched_messages: int = 5,
7979
tool_specs: list[dict[str, Any]] | None = None,
@@ -87,9 +87,15 @@ def __init__(
8787
hf_token=hf_token,
8888
local_mode=local_mode,
8989
)
90-
self.max_context = max_context - 10000
91-
self.compact_size = int(max_context * compact_size)
92-
self.context_length = 0 # Updated after each LLM call with actual usage
90+
# The model's real input-token ceiling (from litellm.get_model_info).
91+
# Compaction triggers at _COMPACT_THRESHOLD_RATIO below it — see
92+
# the compaction_threshold property.
93+
self.model_max_tokens = model_max_tokens
94+
self.compact_size = int(model_max_tokens * compact_size)
95+
# Running count of tokens the last LLM call reported. Drives the
96+
# compaction gate; updated in add_message() with each response's
97+
# usage.total_tokens.
98+
self.running_context_usage = 0
9399
self.untouched_messages = untouched_messages
94100
self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
95101

@@ -149,7 +155,7 @@ def _load_system_prompt(
149155
def add_message(self, message: Message, token_count: int = None) -> None:
150156
"""Add a message to the history"""
151157
if token_count:
152-
self.context_length = token_count
158+
self.running_context_usage = token_count
153159
self.items.append(message)
154160

155161
def get_messages(self) -> list[Message]:
@@ -262,14 +268,27 @@ def truncate_to_user_message(self, user_message_index: int) -> bool:
262268
count += 1
263269
return False
264270

271+
# Compaction fires at 90% of model_max_tokens so there's headroom for
272+
# the next turn's prompt + response before we actually hit the ceiling.
273+
_COMPACT_THRESHOLD_RATIO = 0.9
274+
275+
@property
276+
def compaction_threshold(self) -> int:
277+
"""Token count at which `compact()` kicks in."""
278+
return int(self.model_max_tokens * self._COMPACT_THRESHOLD_RATIO)
279+
280+
@property
281+
def needs_compaction(self) -> bool:
282+
return self.running_context_usage > self.compaction_threshold and bool(self.items)
283+
265284
async def compact(
266285
self,
267286
model_name: str,
268287
tool_specs: list[dict] | None = None,
269288
hf_token: str | None = None,
270289
) -> None:
271290
"""Remove old messages to keep history under target size"""
272-
if (self.context_length <= self.max_context) or not self.items:
291+
if not self.needs_compaction:
273292
return
274293

275294
system_msg = (
@@ -325,6 +344,6 @@ async def compact(
325344
head.append(first_user_msg)
326345
self.items = head + [summarized_message] + recent_messages
327346

328-
self.context_length = (
347+
self.running_context_usage = (
329348
len(self.system_prompt) // 4 + response.usage.completion_tokens
330349
)

‎agent/core/agent_loop.py‎

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -180,29 +180,27 @@ def _friendly_error_message(error: Exception) -> str | None:
180180

181181
async def _compact_and_notify(session: Session) -> None:
182182
"""Run compaction and send event if context was reduced."""
183-
old_length = session.context_manager.context_length
184-
max_ctx = session.context_manager.max_context
183+
cm = session.context_manager
184+
old_usage = cm.running_context_usage
185185
logger.debug(
186-
"Compaction check: context_length=%d, max_context=%d, needs_compact=%s",
187-
old_length, max_ctx, old_length > max_ctx,
186+
"Compaction check: usage=%d, max=%d, threshold=%d, needs_compact=%s",
187+
old_usage, cm.model_max_tokens, cm.compaction_threshold, cm.needs_compaction,
188188
)
189-
tool_specs = session.tool_router.get_tool_specs_for_llm()
190-
await session.context_manager.compact(
189+
await cm.compact(
191190
model_name=session.config.model_name,
192-
tool_specs=tool_specs,
191+
tool_specs=session.tool_router.get_tool_specs_for_llm(),
193192
hf_token=session.hf_token,
194193
)
195-
new_length = session.context_manager.context_length
196-
if new_length != old_length:
194+
new_usage = cm.running_context_usage
195+
if new_usage != old_usage:
197196
logger.warning(
198197
"Context compacted: %d -> %d tokens (max=%d, %d messages)",
199-
old_length, new_length, max_ctx,
200-
len(session.context_manager.items),
198+
old_usage, new_usage, cm.model_max_tokens, len(cm.items),
201199
)
202200
await session.send_event(
203201
Event(
204202
event_type="compacted",
205-
data={"old_tokens": old_length, "new_tokens": new_length},
203+
data={"old_tokens": old_usage, "new_tokens": new_usage},
206204
)
207205
)
208206

@@ -576,13 +574,13 @@ async def run_agent(
576574
logger.debug(
577575
"Agent loop ending: no tool calls. "
578576
"finish_reason=%s, token_count=%d, "
579-
"context_length=%d, max_context=%d, "
577+
"usage=%d, model_max_tokens=%d, "
580578
"iteration=%d/%d, "
581579
"response_text=%s",
582580
finish_reason,
583581
token_count,
584-
session.context_manager.context_length,
585-
session.context_manager.max_context,
582+
session.context_manager.running_context_usage,
583+
session.context_manager.model_max_tokens,
586584
iteration,
587585
max_iterations,
588586
(content or "")[:500],
@@ -785,17 +783,13 @@ async def _exec_tool(
785783

786784
except ContextWindowExceededError:
787785
# Force compact and retry this iteration
786+
cm = session.context_manager
788787
logger.warning(
789788
"ContextWindowExceededError at iteration %d — forcing compaction "
790-
"(context_length=%d, max_context=%d, messages=%d)",
791-
iteration,
792-
session.context_manager.context_length,
793-
session.context_manager.max_context,
794-
len(session.context_manager.items),
795-
)
796-
session.context_manager.context_length = (
797-
session.context_manager.max_context + 1
789+
"(usage=%d, model_max_tokens=%d, messages=%d)",
790+
iteration, cm.running_context_usage, cm.model_max_tokens, len(cm.items),
798791
)
792+
cm.running_context_usage = cm.model_max_tokens + 1
799793
await _compact_and_notify(session)
800794
continue
801795

‎agent/core/session.py‎

Lines changed: 26 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -15,53 +15,37 @@
1515

1616
logger = logging.getLogger(__name__)
1717

18-
# Local max-token lookup — avoids litellm.get_max_tokens() which can hang
19-
# on network calls for certain providers (known litellm issue).
20-
_MAX_TOKENS_MAP: dict[str, int] = {
21-
"anthropic/claude-opus-4-6": 200_000,
22-
"anthropic/claude-opus-4-5-20251101": 200_000,
23-
"anthropic/claude-sonnet-4-5-20250929": 200_000,
24-
"anthropic/claude-sonnet-4-20250514": 200_000,
25-
"anthropic/claude-haiku-3-5-20241022": 200_000,
26-
"anthropic/claude-3-5-sonnet-20241022": 200_000,
27-
"anthropic/claude-3-opus-20240229": 200_000,
28-
}
2918
_DEFAULT_MAX_TOKENS = 200_000
3019

3120

3221
def _get_max_tokens_safe(model_name: str) -> int:
33-
"""Return the max context window for a model.
34-
35-
Anthropic/OpenAI ids hit the local table; HF router ids ask the catalog
36-
(cached) for the max ``context_length`` across live providers. Falls back
37-
to ``_DEFAULT_MAX_TOKENS`` if nothing is available.
22+
"""Return the max input-context tokens for a model.
23+
24+
Primary source: ``litellm.get_model_info(model)['max_input_tokens']`` —
25+
LiteLLM maintains an upstream catalog that knows Claude Opus 4.6 is
26+
1M, GPT-5 is 272k, Sonnet 4.5 is 200k, and so on. Strips any HF routing
27+
suffix / huggingface/ prefix so tagged ids ('moonshotai/Kimi-K2.6:cheapest')
28+
look up the bare model. Falls back to a conservative 200k default for
29+
models not in the catalog (typically HF-router-only models).
3830
"""
39-
tokens = _MAX_TOKENS_MAP.get(model_name)
40-
if tokens:
41-
return tokens
31+
from litellm import get_model_info
4232

43-
if not model_name.startswith(("anthropic/", "openai/")):
33+
candidates = [model_name]
34+
stripped = model_name.removeprefix("huggingface/").split(":", 1)[0]
35+
if stripped != model_name:
36+
candidates.append(stripped)
37+
for candidate in candidates:
4438
try:
45-
from agent.core import hf_router_catalog as cat
46-
47-
bare = model_name.removeprefix("huggingface/").split(":", 1)[0]
48-
info = cat.lookup(bare)
49-
if info and info.max_context_length:
50-
return info.max_context_length
51-
except Exception as e:
52-
logger.warning("HF catalog lookup failed for %s: %s", model_name, e)
53-
54-
try:
55-
from litellm import get_max_tokens
56-
57-
result = get_max_tokens(model_name)
58-
if result and isinstance(result, int):
59-
return result
60-
logger.warning(
61-
f"get_max_tokens returned {result} for {model_name}, using default"
62-
)
63-
except Exception as e:
64-
logger.warning(f"get_max_tokens failed for {model_name}, using default: {e}")
39+
info = get_model_info(candidate)
40+
max_input = info.get("max_input_tokens") if info else None
41+
if isinstance(max_input, int) and max_input > 0:
42+
return max_input
43+
except Exception:
44+
continue
45+
logger.info(
46+
"No litellm.get_model_info entry for %s, falling back to %d",
47+
model_name, _DEFAULT_MAX_TOKENS,
48+
)
6549
return _DEFAULT_MAX_TOKENS
6650

6751

@@ -101,7 +85,7 @@ def __init__(
10185
self.stream = stream
10286
tool_specs = tool_router.get_tool_specs_for_llm() if tool_router else []
10387
self.context_manager = context_manager or ContextManager(
104-
max_context=_get_max_tokens_safe(config.model_name),
88+
model_max_tokens=_get_max_tokens_safe(config.model_name),
10589
compact_size=0.1,
10690
untouched_messages=5,
10791
tool_specs=tool_specs,
@@ -153,7 +137,7 @@ def is_cancelled(self) -> bool:
153137
def update_model(self, model_name: str) -> None:
154138
"""Switch the active model and update the context window limit."""
155139
self.config.model_name = model_name
156-
self.context_manager.max_context = _get_max_tokens_safe(model_name)
140+
self.context_manager.model_max_tokens = _get_max_tokens_safe(model_name)
157141

158142
def increment_turn(self) -> None:
159143
"""Increment turn counter (called after each user interaction)"""

0 commit comments

Comments
 (0)