@@ -166,9 +166,11 @@ async def generate_title(
 ) -> dict:
     """Generate a short title for a chat session based on the first user message.
 
-    Always uses Llama-3.1-8B-Instruct via Cerebras on the HF router. The tab
-    headline renders as plain text, so the model is told to avoid markdown
-    and any stray formatting characters are stripped before returning.
+    Always uses gpt-oss-120b via Cerebras on the HF router. The tab headline
+    renders as plain text, so the model is told to avoid markdown and any
+    stray formatting characters are stripped before returning. gpt-oss is a
+    reasoning model — reasoning_effort=low keeps the reasoning budget small
+    so the 60-token output budget isn't consumed before the title is written.
     """
     api_key = (
         os.environ.get("INFERENCE_TOKEN")
@@ -179,7 +181,7 @@ async def generate_title(
     response = await acompletion(
         # Double openai/ prefix: LiteLLM strips the first as its provider
         # prefix, leaving the HF model id on the wire for the router.
-        model="openai/meta-llama/Llama-3.1-8B-Instruct:cerebras",
+        model="openai/openai/gpt-oss-120b:cerebras",
         api_base="https://router.huggingface.co/v1",
         api_key=api_key,
         messages=[
@@ -195,9 +197,10 @@ async def generate_title(
             },
             {"role": "user", "content": request.text[:500]},
         ],
-        max_tokens=20,
+        max_tokens=60,
         temperature=0.3,
         timeout=10,
+        reasoning_effort="low",
     )
     title = response.choices[0].message.content.strip().strip('"').strip("'")
     title = title.translate(_TITLE_STRIP_CHARS).strip()
0 commit comments