Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 182ddee

Browse files
akseljoonas and claude authored
Fall back to user HF token for router when INFERENCE_TOKEN is unset (#39)
The HF router code path only read INFERENCE_TOKEN, which is the shared server-side key set on the hosted Space so inference is free for users. On the CLI / self-hosted path that env var is absent, so requests went out with no bearer token and the router returned 401 — surfaced to users as "Authentication failed" even with a valid HF_TOKEN (issue #36). Resolve api_key in this order: 1. INFERENCE_TOKEN env (unchanged Space behavior — shared billing) 2. session.hf_token (user's OAuth / CLI token) 3. HF_TOKEN env (belt-and-suspenders for CLI) Applied to _resolve_hf_router_params, research_tool._resolve_llm_params, and ContextManager.compact. Fixes #36 Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
1 parent d0a9a6f commit 182ddee

3 files changed

Lines changed: 39 additions & 10 deletions

File tree

‎agent/context_manager/manager.py‎

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,10 @@ def truncate_to_user_message(self, user_message_index: int) -> bool:
263263
return False
264264

265265
async def compact(
266-
self, model_name: str, tool_specs: list[dict] | None = None
266+
self,
267+
model_name: str,
268+
tool_specs: list[dict] | None = None,
269+
hf_token: str | None = None,
267270
) -> None:
268271
"""Remove old messages to keep history under target size"""
269272
if (self.context_length <= self.max_context) or not self.items:
@@ -303,7 +306,11 @@ async def compact(
303306
)
304307
)
305308

306-
hf_key = os.environ.get("INFERENCE_TOKEN")
309+
hf_key = (
310+
os.environ.get("INFERENCE_TOKEN")
311+
or hf_token
312+
or os.environ.get("HF_TOKEN")
313+
)
307314
response = await acompletion(
308315
model=model_name,
309316
messages=messages_to_summarize,

‎agent/core/agent_loop.py‎

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,11 @@
2020
logger = logging.getLogger(__name__)
2121

2222
ToolCall = ChatCompletionMessageToolCall
23-
# Explicit inference token for LLM API calls (separate from user OAuth tokens).
24-
_INFERENCE_API_KEY = os.environ.get("INFERENCE_TOKEN")
2523

2624

27-
def _resolve_hf_router_params(model_name: str) -> dict:
25+
def _resolve_hf_router_params(
26+
model_name: str, session_hf_token: str | None = None
27+
) -> dict:
2828
"""
2929
Build LiteLLM kwargs for HuggingFace Router models.
3030
@@ -35,6 +35,13 @@ def _resolve_hf_router_params(model_name: str) -> dict:
3535
3636
Input format: huggingface/<router_provider>/<org>/<model>
3737
Example: huggingface/novita/moonshotai/kimi-k2.5
38+
39+
Token resolution (first non-empty wins):
40+
1. INFERENCE_TOKEN env — shared key on the hosted Space so inference
41+
is free for users and billed to the Space owner.
42+
2. session.hf_token — the user's own token (CLI or self-hosted),
43+
resolved from env / huggingface-cli login / cached token file.
44+
3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
3845
"""
3946
if not model_name.startswith("huggingface/"):
4047
return {"model": model_name}
@@ -47,7 +54,11 @@ def _resolve_hf_router_params(model_name: str) -> dict:
4754

4855
router_provider = parts[1]
4956
actual_model = parts[2]
50-
api_key = _INFERENCE_API_KEY
57+
api_key = (
58+
os.environ.get("INFERENCE_TOKEN")
59+
or session_hf_token
60+
or os.environ.get("HF_TOKEN")
61+
)
5162

5263
return {
5364
"model": f"openai/{actual_model}",
@@ -205,6 +216,7 @@ async def _compact_and_notify(session: Session) -> None:
205216
await session.context_manager.compact(
206217
model_name=session.config.model_name,
207218
tool_specs=tool_specs,
219+
hf_token=session.hf_token,
208220
)
209221
new_length = session.context_manager.context_length
210222
if new_length != old_length:
@@ -506,7 +518,9 @@ async def run_agent(
506518
tools = session.tool_router.get_tool_specs_for_llm()
507519
try:
508520
# ── Call the LLM (streaming or non-streaming) ──
509-
llm_params = _resolve_hf_router_params(session.config.model_name)
521+
llm_params = _resolve_hf_router_params(
522+
session.config.model_name, session.hf_token
523+
)
510524
if session.stream:
511525
llm_result = await _call_llm_streaming(session, messages, tools, llm_params)
512526
else:

‎agent/tools/research_tool.py‎

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,9 @@
213213
}
214214

215215

216-
def _resolve_llm_params(model_name: str) -> dict:
216+
def _resolve_llm_params(
217+
model_name: str, session_hf_token: str | None = None
218+
) -> dict:
217219
"""Build LiteLLM kwargs, reusing the HF router logic from agent_loop."""
218220
if not model_name.startswith("huggingface/"):
219221
return {"model": model_name}
@@ -224,10 +226,16 @@ def _resolve_llm_params(model_name: str) -> dict:
224226

225227
provider = parts[1]
226228
model_id = parts[2]
229+
api_key = (
230+
os.environ.get("INFERENCE_TOKEN")
231+
or session_hf_token
232+
or os.environ.get("HF_TOKEN")
233+
or ""
234+
)
227235
return {
228236
"model": f"openai/{model_id}",
229237
"api_base": f"https://router.huggingface.co/{provider}/v3/openai",
230-
"api_key": os.environ.get("INFERENCE_TOKEN", ""),
238+
"api_key": api_key,
231239
}
232240

233241

@@ -264,7 +272,7 @@ async def research_handler(
264272
# Use a cheaper/faster model for research
265273
main_model = session.config.model_name
266274
research_model = _get_research_model(main_model)
267-
llm_params = _resolve_llm_params(research_model)
275+
llm_params = _resolve_llm_params(research_model, getattr(session, "hf_token", None))
268276

269277
# Get read-only tool specs from the session's tool router
270278
tool_specs = [

0 commit comments

Comments (0)