Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 0a9e96d

Browse files
committed
Route HF inference through /v1 auto-router + add reasoning_effort knob
Users paste bare HF model ids (MiniMaxAI/MiniMax-M2.7, moonshotai/Kimi-K2.6) with an optional :fastest|cheapest|preferred|<provider> suffix; the router picks a provider and handles failover. /model does a live preflight against /v1/models and prints providers, pricing, context, tool support — warn-and-allow for unknowns with fuzzy suggestions. Friendly messages replace LiteLLM's raw traceback for model/provider mismatches, and the noisy 'Give Feedback' banner is suppressed. Adds a reasoning_effort config + /effort command (default high). OpenAI and Anthropic get the top-level param natively; HF router gets it via extra_body with minimal->low normalization for models like MiniMax M2 that require reasoning. Frontend + backend model selectors updated to the bare-id format.
1 parent 1c0de34 commit 0a9e96d

11 files changed

Lines changed: 431 additions & 152 deletions

File tree

‎agent/config.py‎

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,15 @@ class Config(BaseModel):
3333
confirm_cpu_jobs: bool = True
3434
auto_file_upload: bool = False
3535

36+
# Reasoning effort for models that support it (GPT-5 / o-series, Claude
37+
# extended thinking, HF reasoning models like MiniMax M2 / Kimi K2).
38+
# Defaults to "high" — we'd rather spend tokens thinking than ship a
39+
# wrong ML recipe. Users can dial down with `/effort low|medium|off`.
40+
# "minimal" is an OpenAI-only level and is normalized to "low" for HF
41+
# router models (MiniMax requires ≥low). Ignored for non-reasoning models.
42+
# Valid values: None | "minimal" | "low" | "medium" | "high"
43+
reasoning_effort: str | None = "high"
44+
3645

3746
def substitute_env_vars(obj: Any) -> Any:
3847
"""

‎agent/context_manager/manager.py‎

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -306,19 +306,14 @@ async def compact(
306306
)
307307
)
308308

309-
hf_key = (
310-
os.environ.get("INFERENCE_TOKEN")
311-
or hf_token
312-
or os.environ.get("HF_TOKEN")
313-
)
309+
from agent.core.llm_params import _resolve_llm_params
310+
311+
llm_params = _resolve_llm_params(model_name, hf_token, reasoning_effort="high")
314312
response = await acompletion(
315-
model=model_name,
316313
messages=messages_to_summarize,
317314
max_completion_tokens=self.compact_size,
318315
tools=tool_specs,
319-
api_key=hf_key
320-
if hf_key and model_name.startswith("huggingface/")
321-
else None,
316+
**llm_params,
322317
)
323318
summarized_message = Message(
324319
role="assistant", content=response.choices[0].message.content

‎agent/core/agent_loop.py‎

Lines changed: 23 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from agent.config import Config
1515
from agent.core.doom_loop import check_for_doom_loop
16+
from agent.core.llm_params import _resolve_llm_params
1617
from agent.core.session import Event, OpType, Session
1718
from agent.core.tools import ToolRouter
1819
from agent.tools.jobs_tool import CPU_FLAVORS
@@ -22,51 +23,6 @@
2223
ToolCall = ChatCompletionMessageToolCall
2324

2425

25-
def _resolve_hf_router_params(
26-
model_name: str, session_hf_token: str | None = None
27-
) -> dict:
28-
"""
29-
Build LiteLLM kwargs for HuggingFace Router models.
30-
31-
api-inference.huggingface.co is deprecated; the new router lives at
32-
router.huggingface.co/<provider>/v3/openai. LiteLLM's built-in
33-
``huggingface/`` provider still targets the old endpoint, so we
34-
rewrite model names to ``openai/`` and supply the correct api_base.
35-
36-
Input format: huggingface/<router_provider>/<org>/<model>
37-
Example: huggingface/novita/moonshotai/kimi-k2.5
38-
39-
Token resolution (first non-empty wins):
40-
1. INFERENCE_TOKEN env — shared key on the hosted Space so inference
41-
is free for users and billed to the Space owner.
42-
2. session.hf_token — the user's own token (CLI or self-hosted),
43-
resolved from env / huggingface-cli login / cached token file.
44-
3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
45-
"""
46-
if not model_name.startswith("huggingface/"):
47-
return {"model": model_name}
48-
49-
parts = model_name.split(
50-
"/", 2
51-
) # ['huggingface', 'novita', 'moonshotai/kimi-k2.5']
52-
if len(parts) < 3:
53-
return {"model": model_name}
54-
55-
router_provider = parts[1]
56-
actual_model = parts[2]
57-
api_key = (
58-
os.environ.get("INFERENCE_TOKEN")
59-
or session_hf_token
60-
or os.environ.get("HF_TOKEN")
61-
)
62-
63-
return {
64-
"model": f"openai/{actual_model}",
65-
"api_base": f"https://router.huggingface.co/{router_provider}/v3/openai",
66-
"api_key": api_key,
67-
}
68-
69-
7026
def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
7127
"""
7228
Validate tool arguments structure.
@@ -201,6 +157,24 @@ def _friendly_error_message(error: Exception) -> str | None:
201157
"at your model provider's dashboard."
202158
)
203159

160+
if "not supported by provider" in err_str or "no provider supports" in err_str:
161+
return (
162+
"The model isn't served by the provider you pinned.\n\n"
163+
"Drop the ':<provider>' suffix to let the HF router auto-pick a "
164+
"provider, or use '/model' (no arg) to see which providers host "
165+
"which models."
166+
)
167+
168+
if "model_not_found" in err_str or (
169+
"model" in err_str
170+
and ("not found" in err_str or "does not exist" in err_str)
171+
):
172+
return (
173+
"Model not found. Use '/model' to list suggestions, or paste an "
174+
"HF model id like 'MiniMaxAI/MiniMax-M2.7'. Availability is shown "
175+
"when you switch."
176+
)
177+
204178
return None
205179

206180

@@ -518,8 +492,10 @@ async def run_agent(
518492
tools = session.tool_router.get_tool_specs_for_llm()
519493
try:
520494
# ── Call the LLM (streaming or non-streaming) ──
521-
llm_params = _resolve_hf_router_params(
522-
session.config.model_name, session.hf_token
495+
llm_params = _resolve_llm_params(
496+
session.config.model_name,
497+
session.hf_token,
498+
reasoning_effort=session.config.reasoning_effort,
523499
)
524500
if session.stream:
525501
llm_result = await _call_llm_streaming(session, messages, tools, llm_params)
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""Fetch and cache the HF Inference Router model catalog.
2+
3+
The router exposes an OpenAI-compatible listing at
4+
``https://router.huggingface.co/v1/models`` with per-provider availability,
5+
pricing, context length, and tool-use support. We use it to:
6+
7+
• Validate ``/model`` switches with live data instead of a hard-coded allowlist.
8+
• Show the user which providers serve a model, at what price, and whether they
9+
support tool calls.
10+
• Derive a reasonable context-window limit for any routed model.
11+
12+
The listing is cached in-memory for a few minutes so repeated lookups during a
13+
session are free. On fetch failure we return stale data if we have it, or an
14+
empty catalog otherwise.
15+
"""
16+
17+
import logging
18+
import time
19+
from dataclasses import dataclass
20+
from difflib import get_close_matches
21+
from typing import Optional
22+
23+
import httpx
24+
25+
logger = logging.getLogger(__name__)

# Catalog endpoint + cache policy. Entries are cached in-process for five
# minutes; the short HTTP timeout keeps /model responsive even when the
# router endpoint is slow.
_CATALOG_URL = "https://router.huggingface.co/v1/models"
_CACHE_TTL_SECONDS = 300
_HTTP_TIMEOUT_SECONDS = 5.0

# Module-level cache: last fetched catalog payload (the parsed JSON dict)
# and the time.time() it was stored. Mutated only by _fetch_catalog().
_cache: Optional[dict] = None
_cache_time: float = 0.0
33+
34+
35+
@dataclass
class ProviderInfo:
    """One provider's listing for a model on the HF Inference Router.

    Field values come straight from the router's /v1/models payload; any
    field the router omits is left at its parsed default (see _parse_entry).
    """

    provider: str  # router provider slug, e.g. "novita"
    status: str  # "live" means currently serving; other values are treated as unavailable
    context_length: Optional[int]  # context window in tokens; None when unreported
    input_price: Optional[float]  # pricing as reported by the router — units not normalized here
    output_price: Optional[float]
    supports_tools: bool
    supports_structured_output: bool
44+
45+
46+
@dataclass
class ModelInfo:
    """A router catalog entry: a model id plus every provider that lists it."""

    id: str
    providers: list[ProviderInfo]

    @property
    def live_providers(self) -> list[ProviderInfo]:
        """Providers whose status is "live" (i.e. currently serving)."""
        serving = []
        for candidate in self.providers:
            if candidate.status == "live":
                serving.append(candidate)
        return serving

    @property
    def max_context_length(self) -> Optional[int]:
        """Largest reported context window across live providers, or None."""
        reported = (p.context_length for p in self.live_providers if p.context_length)
        return max(reported, default=None)

    @property
    def any_supports_tools(self) -> bool:
        """True when at least one live provider advertises tool-call support."""
        for candidate in self.live_providers:
            if candidate.supports_tools:
                return True
        return False
63+
64+
65+
def _fetch_catalog(force: bool = False) -> dict:
    """Return the router catalog dict, refreshing the module cache when stale.

    A cache entry younger than ``_CACHE_TTL_SECONDS`` is returned as-is
    unless ``force`` is set. When the refresh fails we keep serving any
    previously fetched (stale) data; with no prior data at all we store an
    empty ``{"data": []}`` catalog so callers never see ``None``.
    """
    global _cache, _cache_time
    now = time.time()
    have_fresh_cache = _cache is not None and (now - _cache_time) < _CACHE_TTL_SECONDS
    if have_fresh_cache and not force:
        return _cache
    try:
        # Fetch and parse inside one try: a malformed body is treated the
        # same as a network failure.
        response = httpx.get(_CATALOG_URL, timeout=_HTTP_TIMEOUT_SECONDS)
        response.raise_for_status()
        _cache = response.json()
        _cache_time = now
    except Exception as exc:
        logger.warning("Failed to fetch HF router catalog: %s", exc)
        if _cache is None:
            # Never fetched successfully: record an empty catalog and stamp
            # the time so every lookup doesn't immediately re-hit the endpoint.
            _cache = {"data": []}
            _cache_time = now
    return _cache
81+
82+
83+
def _parse_entry(entry: dict) -> ModelInfo:
    """Convert one raw ``/v1/models`` catalog entry into a typed ModelInfo.

    Missing fields default to empty strings / ``None`` / ``False`` so a
    partially populated entry never raises.
    """
    parsed_providers: list[ProviderInfo] = []
    for raw in entry.get("providers", []) or []:
        price_info = raw.get("pricing") or {}
        parsed_providers.append(
            ProviderInfo(
                provider=raw.get("provider", ""),
                status=raw.get("status", ""),
                context_length=raw.get("context_length"),
                input_price=price_info.get("input"),
                output_price=price_info.get("output"),
                supports_tools=bool(raw.get("supports_tools", False)),
                supports_structured_output=bool(
                    raw.get("supports_structured_output", False)
                ),
            )
        )
    return ModelInfo(id=entry.get("id", ""), providers=parsed_providers)
99+
100+
101+
def lookup(model_id: str) -> Optional[ModelInfo]:
    """Find a model in the router catalog.

    Accepts ``<org>/<model>`` or ``<org>/<model>:<tag>``; any ``:<tag>``
    routing suffix is ignored for the lookup. Returns ``None`` when the
    model isn't listed.
    """
    bare_id, _, _suffix = model_id.partition(":")
    entries = _fetch_catalog().get("data", [])
    raw = next((e for e in entries if e.get("id") == bare_id), None)
    return None if raw is None else _parse_entry(raw)
113+
114+
115+
def fuzzy_suggest(model_id: str, limit: int = 3) -> list[str]:
    """Return up to ``limit`` catalog model ids closest to ``model_id``.

    Any ``:<tag>`` routing suffix is stripped before matching.
    """
    bare_id, _, _suffix = model_id.partition(":")
    known_ids = [
        entry["id"]
        for entry in _fetch_catalog().get("data", [])
        if entry.get("id")
    ]
    return get_close_matches(bare_id, known_ids, n=limit, cutoff=0.4)
121+
122+
123+
def prewarm() -> None:
    """Populate the catalog cache ahead of the first lookup.

    Intended for a background task at startup; all failures are swallowed —
    the next lookup simply retries the fetch.
    """
    try:
        _fetch_catalog()
    except Exception:
        pass

‎agent/core/llm_params.py‎

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""LiteLLM kwargs resolution for the model ids this agent accepts.
2+
3+
Kept separate from ``agent_loop`` so tools (research, context compaction, etc.)
4+
can import it without pulling in the whole agent loop / tool router and
5+
creating circular imports.
6+
"""
7+
8+
import os
9+
10+
11+
# HF router reasoning models only accept "low" | "medium" | "high" (e.g.
# MiniMax M2 actually *requires* reasoning to be enabled). OpenAI's GPT-5
# also accepts "minimal" for near-zero thinking. We map "minimal" to "low"
# for HF so the user doesn't get a 400.
# Consumed by _resolve_llm_params() when deciding whether to forward
# reasoning_effort to the HF router via extra_body.
_HF_ALLOWED_EFFORTS = {"low", "medium", "high"}
16+
17+
18+
def _resolve_llm_params(
19+
model_name: str,
20+
session_hf_token: str | None = None,
21+
reasoning_effort: str | None = None,
22+
) -> dict:
23+
"""
24+
Build LiteLLM kwargs for a given model id.
25+
26+
• ``anthropic/<model>`` / ``openai/<model>`` — passed straight through; the
27+
user's own ``ANTHROPIC_API_KEY`` / ``OPENAI_API_KEY`` env vars are picked
28+
up by LiteLLM. ``reasoning_effort`` is forwarded as a top-level param
29+
(GPT-5 / o-series accept "minimal" | "low" | "medium" | "high"; Claude
30+
extended-thinking models accept "low" | "medium" | "high" and LiteLLM
31+
translates to the thinking config).
32+
33+
• Anything else is treated as a HuggingFace router id. We hit the
34+
auto-routing OpenAI-compatible endpoint at
35+
``https://router.huggingface.co/v1``, which bypasses LiteLLM's stale
36+
per-provider HF adapter entirely. The id can be bare or carry an HF
37+
routing suffix:
38+
39+
MiniMaxAI/MiniMax-M2.7 # auto = fastest + failover
40+
MiniMaxAI/MiniMax-M2.7:cheapest
41+
moonshotai/Kimi-K2.6:novita # pin a specific provider
42+
43+
A leading ``huggingface/`` is stripped for convenience. ``reasoning_effort``
44+
is forwarded via ``extra_body`` (LiteLLM's OpenAI adapter refuses it as a
45+
top-level kwarg for non-OpenAI models). "minimal" is normalized to "low".
46+
47+
Token precedence (first non-empty wins):
48+
1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
49+
free for users, billed to the Space owner via ``X-HF-Bill-To``).
50+
2. session.hf_token — the user's own token (CLI / OAuth / cache file).
51+
3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
52+
"""
53+
if model_name.startswith(("anthropic/", "openai/")):
54+
params: dict = {"model": model_name}
55+
if reasoning_effort:
56+
params["reasoning_effort"] = reasoning_effort
57+
return params
58+
59+
hf_model = model_name.removeprefix("huggingface/")
60+
api_key = (
61+
os.environ.get("INFERENCE_TOKEN")
62+
or session_hf_token
63+
or os.environ.get("HF_TOKEN")
64+
)
65+
params = {
66+
"model": f"openai/{hf_model}",
67+
"api_base": "https://router.huggingface.co/v1",
68+
"api_key": api_key,
69+
}
70+
if os.environ.get("INFERENCE_TOKEN"):
71+
params["extra_headers"] = {"X-HF-Bill-To": "huggingface"}
72+
if reasoning_effort:
73+
hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
74+
if hf_level in _HF_ALLOWED_EFFORTS:
75+
params["extra_body"] = {"reasoning_effort": hf_level}
76+
return params

0 commit comments

Comments
 (0)