Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 0a9e96d

Browse files
committed
Route HF inference through /v1 auto-router + add reasoning_effort knob
Users paste bare HF model ids (MiniMaxAI/MiniMax-M2.7, moonshotai/Kimi-K2.6) with an optional :fastest|cheapest|preferred|<provider> suffix; the router picks a provider and handles failover. /model does a live preflight against /v1/models and prints providers, pricing, context, tool support — warn-and-allow for unknowns with fuzzy suggestions. Friendly messages replace LiteLLM's raw traceback for model/provider mismatches, and the noisy 'Give Feedback' banner is suppressed. Adds a reasoning_effort config + /effort command (default high). OpenAI and Anthropic get the top-level param natively; HF router gets it via extra_body with minimal->low normalization for models like MiniMax M2 that require reasoning. Frontend + backend model selectors updated to the bare-id format.
1 parent 1c0de34 commit 0a9e96d

11 files changed

Lines changed: 431 additions & 152 deletions

File tree

‎agent/config.py‎

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,15 @@ class Config(BaseModel):
3333
confirm_cpu_jobs: bool = True
3434
auto_file_upload: bool = False
3535

36+
# Reasoning effort for models that support it (GPT-5 / o-series, Claude
37+
# extended thinking, HF reasoning models like MiniMax M2 / Kimi K2).
38+
# Defaults to "high" — we'd rather spend tokens thinking than ship a
39+
# wrong ML recipe. Users can dial down with `/effort low|medium|off`.
40+
# "minimal" is an OpenAI-only level and is normalized to "low" for HF
41+
# router models (MiniMax requires ≥low). Ignored for non-reasoning models.
42+
# Valid values: None | "minimal" | "low" | "medium" | "high"
43+
reasoning_effort: str | None = "high"
44+
3645

3746
def substitute_env_vars(obj: Any) -> Any:
3847
"""

‎agent/context_manager/manager.py‎

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -306,19 +306,14 @@ async def compact(
306306
)
307307
)
308308

309-
hf_key = (
310-
os.environ.get("INFERENCE_TOKEN")
311-
or hf_token
312-
or os.environ.get("HF_TOKEN")
313-
)
309+
from agent.core.llm_params import _resolve_llm_params
310+
311+
llm_params = _resolve_llm_params(model_name, hf_token, reasoning_effort="high")
314312
response = await acompletion(
315-
model=model_name,
316313
messages=messages_to_summarize,
317314
max_completion_tokens=self.compact_size,
318315
tools=tool_specs,
319-
api_key=hf_key
320-
if hf_key and model_name.startswith("huggingface/")
321-
else None,
316+
**llm_params,
322317
)
323318
summarized_message = Message(
324319
role="assistant", content=response.choices[0].message.content

‎agent/core/agent_loop.py‎

Lines changed: 23 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from agent.config import Config
1515
from agent.core.doom_loop import check_for_doom_loop
16+
from agent.core.llm_params import _resolve_llm_params
1617
from agent.core.session import Event, OpType, Session
1718
from agent.core.tools import ToolRouter
1819
from agent.tools.jobs_tool import CPU_FLAVORS
@@ -22,51 +23,6 @@
2223
ToolCall = ChatCompletionMessageToolCall
2324

2425

25-
def _resolve_hf_router_params(
26-
model_name: str, session_hf_token: str | None = None
27-
) -> dict:
28-
"""
29-
Build LiteLLM kwargs for HuggingFace Router models.
30-
31-
api-inference.huggingface.co is deprecated; the new router lives at
32-
router.huggingface.co/<provider>/v3/openai. LiteLLM's built-in
33-
``huggingface/`` provider still targets the old endpoint, so we
34-
rewrite model names to ``openai/`` and supply the correct api_base.
35-
36-
Input format: huggingface/<router_provider>/<org>/<model>
37-
Example: huggingface/novita/moonshotai/kimi-k2.5
38-
39-
Token resolution (first non-empty wins):
40-
1. INFERENCE_TOKEN env — shared key on the hosted Space so inference
41-
is free for users and billed to the Space owner.
42-
2. session.hf_token — the user's own token (CLI or self-hosted),
43-
resolved from env / huggingface-cli login / cached token file.
44-
3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
45-
"""
46-
if not model_name.startswith("huggingface/"):
47-
return {"model": model_name}
48-
49-
parts = model_name.split(
50-
"/", 2
51-
) # ['huggingface', 'novita', 'moonshotai/kimi-k2.5']
52-
if len(parts) < 3:
53-
return {"model": model_name}
54-
55-
router_provider = parts[1]
56-
actual_model = parts[2]
57-
api_key = (
58-
os.environ.get("INFERENCE_TOKEN")
59-
or session_hf_token
60-
or os.environ.get("HF_TOKEN")
61-
)
62-
63-
return {
64-
"model": f"openai/{actual_model}",
65-
"api_base": f"https://router.huggingface.co/{router_provider}/v3/openai",
66-
"api_key": api_key,
67-
}
68-
69-
7026
def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
7127
"""
7228
Validate tool arguments structure.
@@ -201,6 +157,24 @@ def _friendly_error_message(error: Exception) -> str | None:
201157
"at your model provider's dashboard."
202158
)
203159

160+
if "not supported by provider" in err_str or "no provider supports" in err_str:
161+
return (
162+
"The model isn't served by the provider you pinned.\n\n"
163+
"Drop the ':<provider>' suffix to let the HF router auto-pick a "
164+
"provider, or use '/model' (no arg) to see which providers host "
165+
"which models."
166+
)
167+
168+
if "model_not_found" in err_str or (
169+
"model" in err_str
170+
and ("not found" in err_str or "does not exist" in err_str)
171+
):
172+
return (
173+
"Model not found. Use '/model' to list suggestions, or paste an "
174+
"HF model id like 'MiniMaxAI/MiniMax-M2.7'. Availability is shown "
175+
"when you switch."
176+
)
177+
204178
return None
205179

206180

@@ -518,8 +492,10 @@ async def run_agent(
518492
tools = session.tool_router.get_tool_specs_for_llm()
519493
try:
520494
# ── Call the LLM (streaming or non-streaming) ──
521-
llm_params = _resolve_hf_router_params(
522-
session.config.model_name, session.hf_token
495+
llm_params = _resolve_llm_params(
496+
session.config.model_name,
497+
session.hf_token,
498+
reasoning_effort=session.config.reasoning_effort,
523499
)
524500
if session.stream:
525501
llm_result = await _call_llm_streaming(session, messages, tools, llm_params)
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""Fetch and cache the HF Inference Router model catalog.
2+
3+
The router exposes an OpenAI-compatible listing at
4+
``https://router.huggingface.co/v1/models`` with per-provider availability,
5+
pricing, context length, and tool-use support. We use it to:
6+
7+
• Validate ``/model`` switches with live data instead of a hard-coded allowlist.
8+
• Show the user which providers serve a model, at what price, and whether they
9+
support tool calls.
10+
• Derive a reasonable context-window limit for any routed model.
11+
12+
The listing is cached in-memory for a few minutes so repeated lookups during a
13+
session are free. On fetch failure we return stale data if we have it, or an
14+
empty catalog otherwise.
15+
"""
16+
17+
import logging
18+
import time
19+
from dataclasses import dataclass
20+
from difflib import get_close_matches
21+
from typing import Optional
22+
23+
import httpx
24+
25+
logger = logging.getLogger(__name__)

# Catalog endpoint + cache policy. Entries are cached in-process for five
# minutes; the short HTTP timeout keeps /model responsive even when the
# router endpoint is slow.
_CATALOG_URL = "https://router.huggingface.co/v1/models"
_CACHE_TTL_SECONDS = 300
_HTTP_TIMEOUT_SECONDS = 5.0

# Module-level cache: last fetched catalog payload (the parsed JSON dict)
# and the time.time() it was stored. Mutated only by _fetch_catalog().
_cache: Optional[dict] = None
_cache_time: float = 0.0
33+
34+
35+
@dataclass
class ProviderInfo:
    """One provider's listing for a model on the HF Inference Router.

    Field values come straight from the router's /v1/models payload; any
    field the router omits is left at its parsed default (see _parse_entry).
    """

    provider: str  # router provider slug, e.g. "novita"
    status: str  # "live" means currently serving; other values are treated as unavailable
    context_length: Optional[int]  # context window in tokens; None when unreported
    input_price: Optional[float]  # pricing as reported by the router — units not normalized here
    output_price: Optional[float]
    supports_tools: bool
    supports_structured_output: bool
44+
45+
46+
@dataclass
class ModelInfo:
    """A router catalog entry: a model id plus every provider that lists it."""

    id: str
    providers: list[ProviderInfo]

    @property
    def live_providers(self) -> list[ProviderInfo]:
        """Providers whose status is "live" (i.e. currently serving)."""
        serving = []
        for candidate in self.providers:
            if candidate.status == "live":
                serving.append(candidate)
        return serving

    @property
    def max_context_length(self) -> Optional[int]:
        """Largest reported context window across live providers, or None."""
        reported = (p.context_length for p in self.live_providers if p.context_length)
        return max(reported, default=None)

    @property
    def any_supports_tools(self) -> bool:
        """True when at least one live provider advertises tool-call support."""
        for candidate in self.live_providers:
            if candidate.supports_tools:
                return True
        return False
63+
64+
65+
def _fetch_catalog(force: bool = False) -> dict:
    """Return the router catalog dict, refreshing the module cache when stale.

    A cache entry younger than ``_CACHE_TTL_SECONDS`` is returned as-is
    unless ``force`` is set. When the refresh fails we keep serving any
    previously fetched (stale) data; with no prior data at all we store an
    empty ``{"data": []}`` catalog so callers never see ``None``.
    """
    global _cache, _cache_time
    now = time.time()
    have_fresh_cache = _cache is not None and (now - _cache_time) < _CACHE_TTL_SECONDS
    if have_fresh_cache and not force:
        return _cache
    try:
        # Fetch and parse inside one try: a malformed body is treated the
        # same as a network failure.
        response = httpx.get(_CATALOG_URL, timeout=_HTTP_TIMEOUT_SECONDS)
        response.raise_for_status()
        _cache = response.json()
        _cache_time = now
    except Exception as exc:
        logger.warning("Failed to fetch HF router catalog: %s", exc)
        if _cache is None:
            # Never fetched successfully: record an empty catalog and stamp
            # the time so every lookup doesn't immediately re-hit the endpoint.
            _cache = {"data": []}
            _cache_time = now
    return _cache
81+
82+
83+
def _parse_entry(entry: dict) -> ModelInfo:
    """Convert one raw ``/v1/models`` catalog entry into a typed ModelInfo.

    Missing fields default to empty strings / ``None`` / ``False`` so a
    partially populated entry never raises.
    """
    parsed_providers: list[ProviderInfo] = []
    for raw in entry.get("providers", []) or []:
        price_info = raw.get("pricing") or {}
        parsed_providers.append(
            ProviderInfo(
                provider=raw.get("provider", ""),
                status=raw.get("status", ""),
                context_length=raw.get("context_length"),
                input_price=price_info.get("input"),
                output_price=price_info.get("output"),
                supports_tools=bool(raw.get("supports_tools", False)),
                supports_structured_output=bool(
                    raw.get("supports_structured_output", False)
                ),
            )
        )
    return ModelInfo(id=entry.get("id", ""), providers=parsed_providers)
99+
100+
101+
def lookup(model_id: str) -> Optional[ModelInfo]:
    """Find a model in the router catalog.

    Accepts ``<org>/<model>`` or ``<org>/<model>:<tag>``; any ``:<tag>``
    routing suffix is ignored for the lookup. Returns ``None`` when the
    model isn't listed.
    """
    bare_id, _, _suffix = model_id.partition(":")
    entries = _fetch_catalog().get("data", [])
    raw = next((e for e in entries if e.get("id") == bare_id), None)
    return None if raw is None else _parse_entry(raw)
113+
114+
115+
def fuzzy_suggest(model_id: str, limit: int = 3) -> list[str]:
    """Return up to ``limit`` catalog model ids closest to ``model_id``.

    Any ``:<tag>`` routing suffix is stripped before matching.
    """
    bare_id, _, _suffix = model_id.partition(":")
    known_ids = [
        entry["id"]
        for entry in _fetch_catalog().get("data", [])
        if entry.get("id")
    ]
    return get_close_matches(bare_id, known_ids, n=limit, cutoff=0.4)
121+
122+
123+
def prewarm() -> None:
    """Populate the catalog cache ahead of the first lookup.

    Intended for a background task at startup; all failures are swallowed —
    the next lookup simply retries the fetch.
    """
    try:
        _fetch_catalog()
    except Exception:
        pass

‎agent/core/llm_params.py‎

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""LiteLLM kwargs resolution for the model ids this agent accepts.
2+
3+
Kept separate from ``agent_loop`` so tools (research, context compaction, etc.)
4+
can import it without pulling in the whole agent loop / tool router and
5+
creating circular imports.
6+
"""
7+
8+
import os
9+
10+
11+
# HF router reasoning models only accept "low" | "medium" | "high" (e.g.
# MiniMax M2 actually *requires* reasoning to be enabled). OpenAI's GPT-5
# also accepts "minimal" for near-zero thinking. We map "minimal" to "low"
# for HF so the user doesn't get a 400.
# Consumed by _resolve_llm_params() when deciding whether to forward
# reasoning_effort to the HF router via extra_body.
_HF_ALLOWED_EFFORTS = {"low", "medium", "high"}
16+
17+
18+
def _resolve_llm_params(
19+
model_name: str,
20+
session_hf_token: str | None = None,
21+
reasoning_effort: str | None = None,
22+
) -> dict:
23+
"""
24+
Build LiteLLM kwargs for a given model id.
25+
26+
• ``anthropic/<model>`` / ``openai/<model>`` — passed straight through; the
27+
user's own ``ANTHROPIC_API_KEY`` / ``OPENAI_API_KEY`` env vars are picked
28+
up by LiteLLM. ``reasoning_effort`` is forwarded as a top-level param
29+
(GPT-5 / o-series accept "minimal" | "low" | "medium" | "high"; Claude
30+
extended-thinking models accept "low" | "medium" | "high" and LiteLLM
31+
translates to the thinking config).
32+
33+
• Anything else is treated as a HuggingFace router id. We hit the
34+
auto-routing OpenAI-compatible endpoint at
35+
``https://router.huggingface.co/v1``, which bypasses LiteLLM's stale
36+
per-provider HF adapter entirely. The id can be bare or carry an HF
37+
routing suffix:
38+
39+
MiniMaxAI/MiniMax-M2.7 # auto = fastest + failover
40+
MiniMaxAI/MiniMax-M2.7:cheapest
41+
moonshotai/Kimi-K2.6:novita # pin a specific provider
42+
43+
A leading ``huggingface/`` is stripped for convenience. ``reasoning_effort``
44+
is forwarded via ``extra_body`` (LiteLLM's OpenAI adapter refuses it as a
45+
top-level kwarg for non-OpenAI models). "minimal" is normalized to "low".
46+
47+
Token precedence (first non-empty wins):
48+
1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
49+
free for users, billed to the Space owner via ``X-HF-Bill-To``).
50+
2. session.hf_token — the user's own token (CLI / OAuth / cache file).
51+
3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
52+
"""
53+
if model_name.startswith(("anthropic/", "openai/")):
54+
params: dict = {"model": model_name}
55+
if reasoning_effort:
56+
params["reasoning_effort"] = reasoning_effort
57+
return params
58+
59+
hf_model = model_name.removeprefix("huggingface/")
60+
api_key = (
61+
os.environ.get("INFERENCE_TOKEN")
62+
or session_hf_token
63+
or os.environ.get("HF_TOKEN")
64+
)
65+
params = {
66+
"model": f"openai/{hf_model}",
67+
"api_base": "https://router.huggingface.co/v1",
68+
"api_key": api_key,
69+
}
70+
if os.environ.get("INFERENCE_TOKEN"):
71+
params["extra_headers"] = {"X-HF-Bill-To": "huggingface"}
72+
if reasoning_effort:
73+
hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
74+
if hf_level in _HF_ALLOWED_EFFORTS:
75+
params["extra_body"] = {"reasoning_effort": hf_level}
76+
return params

0 commit comments

Comments
 (0)