Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit b75f47b

Browse files
committed
Gate NIM thinking params behind NIM_ENABLE_THINKING env var
Mistral models reject chat_template_kwargs, causing 400 errors. Make thinking params (chat_template_kwargs, reasoning_budget) opt-in via NIM_ENABLE_THINKING env var (default false) so only models that need it (kimi, nemotron) receive them.
1 parent ab0d6ac commit b75f47b

6 files changed

Lines changed: 49 additions & 8 deletions

File tree

.env.example

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,18 @@ LLAMACPP_BASE_URL="http://localhost:8080/v1"
1717
# All Claude model requests are mapped to these models, plain model is fallback
1818
# Format: provider_type/model/name
1919
# Valid providers: "nvidia_nim" | "open_router" | "lmstudio" | "llamacpp"
20-
# model that happens to be loaded.
2120
MODEL_OPUS="nvidia_nim/z-ai/glm4.7"
2221
MODEL_SONNET="open_router/arcee-ai/trinity-large-preview:free"
2322
MODEL_HAIKU="open_router/stepfun/step-3.5-flash:free"
2423
MODEL="nvidia_nim/z-ai/glm4.7"
2524

2625

26+
# NIM Settings
27+
# Enable chat_template_kwargs + reasoning_budget for thinking models (kimi, nemotron).
28+
# Leave false for models that don't support it (e.g. Mistral).
29+
NIM_ENABLE_THINKING=false
30+
31+
2732
# Provider config
2833
PROVIDER_RATE_LIMIT=40
2934
PROVIDER_RATE_WINDOW=60
@@ -77,4 +82,4 @@ FAST_PREFIX_DETECTION=true
7782
ENABLE_NETWORK_PROBE_MOCK=true
7883
ENABLE_TITLE_GENERATION_SKIP=true
7984
ENABLE_SUGGESTION_MODE_SKIP=true
80-
ENABLE_FILEPATH_EXTRACTION_MOCK=true
85+
ENABLE_FILEPATH_EXTRACTION_MOCK=true

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ MODEL_OPUS="nvidia_nim/z-ai/glm4.7"
7373
MODEL_SONNET="nvidia_nim/moonshotai/kimi-k2-thinking"
7474
MODEL_HAIKU="nvidia_nim/stepfun-ai/step-3.5-flash"
7575
MODEL="nvidia_nim/z-ai/glm4.7" # fallback
76+
77+
# Enable for thinking models (kimi, nemotron). Leave false for others (e.g. Mistral).
78+
NIM_ENABLE_THINKING=true
7679
```
7780

7881
</details>
@@ -437,7 +440,8 @@ Configure via `WHISPER_DEVICE` (`cpu` | `cuda` | `nvidia_nim`) and `WHISPER_MODE
437440
| `MODEL_OPUS` | Model for Claude Opus requests (falls back to `MODEL`) | `nvidia_nim/z-ai/glm4.7` |
438441
| `MODEL_SONNET` | Model for Claude Sonnet requests (falls back to `MODEL`) | `open_router/arcee-ai/trinity-large-preview:free` |
439442
| `MODEL_HAIKU` | Model for Claude Haiku requests (falls back to `MODEL`) | `open_router/stepfun/step-3.5-flash:free` |
440-
| `NVIDIA_NIM_API_KEY` | NVIDIA API key | required for NIM |
443+
| `NVIDIA_NIM_API_KEY` | NVIDIA API key | required for NIM |
444+
| `NIM_ENABLE_THINKING` | Send `chat_template_kwargs` + `reasoning_budget` on NIM requests. Enable for thinking models (kimi, nemotron); leave `false` for others (e.g. Mistral) | `false` |
441445
| `OPENROUTER_API_KEY` | OpenRouter API key | required for OpenRouter |
442446
| `LM_STUDIO_BASE_URL` | LM Studio server URL | `http://localhost:1234/v1` |
443447
| `LLAMACPP_BASE_URL` | llama.cpp server URL | `http://localhost:8080/v1` |

config/nim.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ class NimSettings(BaseModel):
2121

2222
parallel_tool_calls: bool = True
2323
ignore_eos: bool = False
24+
enable_thinking: bool = False
2425

2526
min_tokens: int = Field(0, ge=0)
2627
chat_template: str | None = None

config/settings.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@ class Settings(BaseSettings):
9090

9191
# ==================== NIM Settings ====================
9292
nim: NimSettings = Field(default_factory=NimSettings)
93+
nim_enable_thinking: bool = Field(
94+
default=False, validation_alias="NIM_ENABLE_THINKING"
95+
)
9396

9497
# ==================== Voice Note Transcription ====================
9598
voice_note_enabled: bool = Field(
@@ -171,6 +174,13 @@ def validate_model_format(cls, v: str | None) -> str | None:
171174
)
172175
return v
173176

177+
@model_validator(mode="after")
178+
def _inject_nim_thinking(self) -> Settings:
179+
self.nim = self.nim.model_copy(
180+
update={"enable_thinking": self.nim_enable_thinking}
181+
)
182+
return self
183+
174184
@model_validator(mode="after")
175185
def check_nvidia_nim_api_key(self) -> Settings:
176186
if (

providers/nvidia_nim/request.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,11 @@ def build_request_body(request_data: Any, nim: NimSettings) -> dict:
6363
if request_extra:
6464
extra_body.update(request_extra)
6565

66-
extra_body.setdefault(
67-
"chat_template_kwargs", {"thinking": True, "enable_thinking": True}
68-
)
69-
_set_extra(extra_body, "reasoning_budget", max_tokens)
66+
if nim.enable_thinking:
67+
extra_body.setdefault(
68+
"chat_template_kwargs", {"thinking": True, "enable_thinking": True}
69+
)
70+
_set_extra(extra_body, "reasoning_budget", max_tokens)
7071

7172
req_top_k = getattr(request_data, "top_k", None)
7273
top_k = req_top_k if req_top_k is not None else nim.top_k

tests/providers/test_nvidia_nim_request.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def test_reasoning_params_in_extra_body(self):
9898
req.extra_body = None
9999
req.top_k = None
100100

101-
nim = NimSettings()
101+
nim = NimSettings(enable_thinking=True)
102102
body = build_request_body(req, nim)
103103
extra = body["extra_body"]
104104
assert extra["chat_template_kwargs"] == {
@@ -107,6 +107,26 @@ def test_reasoning_params_in_extra_body(self):
107107
}
108108
assert extra["reasoning_budget"] == body["max_tokens"]
109109

110+
def test_no_chat_template_kwargs_when_thinking_disabled(self):
111+
req = MagicMock()
112+
req.model = "test"
113+
req.messages = [MagicMock(role="user", content="hi")]
114+
req.max_tokens = 100
115+
req.system = None
116+
req.temperature = None
117+
req.top_p = None
118+
req.stop_sequences = None
119+
req.tools = None
120+
req.tool_choice = None
121+
req.extra_body = None
122+
req.top_k = None
123+
124+
nim = NimSettings(enable_thinking=False)
125+
body = build_request_body(req, nim)
126+
extra = body.get("extra_body", {})
127+
assert "chat_template_kwargs" not in extra
128+
assert "reasoning_budget" not in extra
129+
110130
def test_no_reasoning_params_in_extra_body(self):
111131
req = MagicMock()
112132
req.model = "test"

0 commit comments

Comments (0)