Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit b75f47b

Browse files
committed
Gate NIM thinking params behind NIM_ENABLE_THINKING env var
Mistral models reject chat_template_kwargs, causing 400 errors. Make thinking params (chat_template_kwargs, reasoning_budget) opt-in via NIM_ENABLE_THINKING env var (default false) so only models that need it (kimi, nemotron) receive them.
1 parent ab0d6ac commit b75f47b

6 files changed

Lines changed: 49 additions & 8 deletions

File tree

.env.example

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,18 @@ LLAMACPP_BASE_URL="http://localhost:8080/v1"
1717
# All Claude model requests are mapped to these models, plain model is fallback
1818
# Format: provider_type/model/name
1919
# Valid providers: "nvidia_nim" | "open_router" | "lmstudio" | "llamacpp"
20-
# model that happens to be loaded.
2120
MODEL_OPUS="nvidia_nim/z-ai/glm4.7"
2221
MODEL_SONNET="open_router/arcee-ai/trinity-large-preview:free"
2322
MODEL_HAIKU="open_router/stepfun/step-3.5-flash:free"
2423
MODEL="nvidia_nim/z-ai/glm4.7"
2524

2625

26+
# NIM Settings
27+
# Enable chat_template_kwargs + reasoning_budget for thinking models (kimi, nemotron).
28+
# Leave false for models that don't support it (e.g. Mistral).
29+
NIM_ENABLE_THINKING=false
30+
31+
2732
# Provider config
2833
PROVIDER_RATE_LIMIT=40
2934
PROVIDER_RATE_WINDOW=60
@@ -77,4 +82,4 @@ FAST_PREFIX_DETECTION=true
7782
ENABLE_NETWORK_PROBE_MOCK=true
7883
ENABLE_TITLE_GENERATION_SKIP=true
7984
ENABLE_SUGGESTION_MODE_SKIP=true
80-
ENABLE_FILEPATH_EXTRACTION_MOCK=true
85+
ENABLE_FILEPATH_EXTRACTION_MOCK=true

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ MODEL_OPUS="nvidia_nim/z-ai/glm4.7"
7373
MODEL_SONNET="nvidia_nim/moonshotai/kimi-k2-thinking"
7474
MODEL_HAIKU="nvidia_nim/stepfun-ai/step-3.5-flash"
7575
MODEL="nvidia_nim/z-ai/glm4.7" # fallback
76+
77+
# Enable for thinking models (kimi, nemotron). Leave false for others (e.g. Mistral).
78+
NIM_ENABLE_THINKING=true
7679
```
7780

7881
</details>
@@ -437,7 +440,8 @@ Configure via `WHISPER_DEVICE` (`cpu` | `cuda` | `nvidia_nim`) and `WHISPER_MODE
437440
| `MODEL_OPUS` | Model for Claude Opus requests (falls back to `MODEL`) | `nvidia_nim/z-ai/glm4.7` |
438441
| `MODEL_SONNET` | Model for Claude Sonnet requests (falls back to `MODEL`) | `open_router/arcee-ai/trinity-large-preview:free` |
439442
| `MODEL_HAIKU` | Model for Claude Haiku requests (falls back to `MODEL`) | `open_router/stepfun/step-3.5-flash:free` |
440-
| `NVIDIA_NIM_API_KEY` | NVIDIA API key | required for NIM |
443+
| `NVIDIA_NIM_API_KEY` | NVIDIA API key | required for NIM |
444+
| `NIM_ENABLE_THINKING` | Send `chat_template_kwargs` + `reasoning_budget` on NIM requests. Enable for thinking models (kimi, nemotron); leave `false` for others (e.g. Mistral) | `false` |
441445
| `OPENROUTER_API_KEY` | OpenRouter API key | required for OpenRouter |
442446
| `LM_STUDIO_BASE_URL` | LM Studio server URL | `http://localhost:1234/v1` |
443447
| `LLAMACPP_BASE_URL` | llama.cpp server URL | `http://localhost:8080/v1` |

config/nim.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ class NimSettings(BaseModel):
2121

2222
parallel_tool_calls: bool = True
2323
ignore_eos: bool = False
24+
enable_thinking: bool = False
2425

2526
min_tokens: int = Field(0, ge=0)
2627
chat_template: str | None = None

config/settings.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@ class Settings(BaseSettings):
9090

9191
# ==================== NIM Settings ====================
9292
nim: NimSettings = Field(default_factory=NimSettings)
93+
nim_enable_thinking: bool = Field(
94+
default=False, validation_alias="NIM_ENABLE_THINKING"
95+
)
9396

9497
# ==================== Voice Note Transcription ====================
9598
voice_note_enabled: bool = Field(
@@ -171,6 +174,13 @@ def validate_model_format(cls, v: str | None) -> str | None:
171174
)
172175
return v
173176

177+
@model_validator(mode="after")
178+
def _inject_nim_thinking(self) -> Settings:
179+
self.nim = self.nim.model_copy(
180+
update={"enable_thinking": self.nim_enable_thinking}
181+
)
182+
return self
183+
174184
@model_validator(mode="after")
175185
def check_nvidia_nim_api_key(self) -> Settings:
176186
if (

providers/nvidia_nim/request.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,11 @@ def build_request_body(request_data: Any, nim: NimSettings) -> dict:
6363
if request_extra:
6464
extra_body.update(request_extra)
6565

66-
extra_body.setdefault(
67-
"chat_template_kwargs", {"thinking": True, "enable_thinking": True}
68-
)
69-
_set_extra(extra_body, "reasoning_budget", max_tokens)
66+
if nim.enable_thinking:
67+
extra_body.setdefault(
68+
"chat_template_kwargs", {"thinking": True, "enable_thinking": True}
69+
)
70+
_set_extra(extra_body, "reasoning_budget", max_tokens)
7071

7172
req_top_k = getattr(request_data, "top_k", None)
7273
top_k = req_top_k if req_top_k is not None else nim.top_k

tests/providers/test_nvidia_nim_request.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def test_reasoning_params_in_extra_body(self):
9898
req.extra_body = None
9999
req.top_k = None
100100

101-
nim = NimSettings()
101+
nim = NimSettings(enable_thinking=True)
102102
body = build_request_body(req, nim)
103103
extra = body["extra_body"]
104104
assert extra["chat_template_kwargs"] == {
@@ -107,6 +107,26 @@ def test_reasoning_params_in_extra_body(self):
107107
}
108108
assert extra["reasoning_budget"] == body["max_tokens"]
109109

110+
def test_no_chat_template_kwargs_when_thinking_disabled(self):
111+
req = MagicMock()
112+
req.model = "test"
113+
req.messages = [MagicMock(role="user", content="hi")]
114+
req.max_tokens = 100
115+
req.system = None
116+
req.temperature = None
117+
req.top_p = None
118+
req.stop_sequences = None
119+
req.tools = None
120+
req.tool_choice = None
121+
req.extra_body = None
122+
req.top_k = None
123+
124+
nim = NimSettings(enable_thinking=False)
125+
body = build_request_body(req, nim)
126+
extra = body.get("extra_body", {})
127+
assert "chat_template_kwargs" not in extra
128+
assert "reasoning_budget" not in extra
129+
110130
def test_no_reasoning_params_in_extra_body(self):
111131
req = MagicMock()
112132
req.model = "test"

0 commit comments

Comments (0)