     CreateEmbeddingRequest,
     CreateChatCompletionRequest,
     ModelList,
+    TokenizeInputRequest,
+    TokenizeInputResponse,
+    TokenizeInputCountResponse,
+    DetokenizeInputRequest,
+    DetokenizeInputResponse,
 )
 from llama_cpp.server.errors import RouteErrorHandler

@@ -196,6 +201,9 @@ async def authenticate(
     )


+openai_v1_tag = "OpenAI V1"
+
+
 @router.post(
     "/v1/completions",
     summary="Completion",
@@ -227,11 +235,13 @@ async def authenticate(
             },
         }
     },
+    tags=[openai_v1_tag],
 )
 @router.post(
     "/v1/engines/copilot-codex/completions",
     include_in_schema=False,
     dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
 )
 async def create_completion(
     request: Request,
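For context (not part of the diff): FastAPI's `tags` argument only affects how routes are grouped in the generated OpenAPI schema and the interactive docs; it does not change routing or handler behavior. A minimal, self-contained sketch of the same grouping pattern, with illustrative endpoint bodies:

# Standalone sketch (not from this PR) of grouping routes via FastAPI tags.
# Endpoint bodies here are placeholders.
from fastapi import APIRouter, FastAPI

router = APIRouter()

openai_v1_tag = "OpenAI V1"
extras_tag = "Extras"


@router.post("/v1/completions", tags=[openai_v1_tag])
async def create_completion() -> dict:
    return {"object": "text_completion"}


@router.post("/extras/tokenize", tags=[extras_tag])
async def tokenize() -> dict:
    return {"tokens": []}


app = FastAPI()
app.include_router(router)
# The generated schema now lists each operation under its tag, e.g.
# app.openapi()["paths"]["/extras/tokenize"]["post"]["tags"] == ["Extras"]

Defining the tag once as a module-level constant, as the PR does with `openai_v1_tag`, keeps the group name consistent across every decorator that uses it.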
@@ -297,7 +307,10 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:


 @router.post(
-    "/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)]
+    "/v1/embeddings",
+    summary="Embedding",
+    dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
 )
 async def create_embedding(
     request: CreateEmbeddingRequest,
@@ -339,6 +352,7 @@ async def create_embedding(
             },
         }
     },
+    tags=[openai_v1_tag],
 )
 async def create_chat_completion(
     request: Request,
@@ -391,7 +405,12 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]:
         return iterator_or_completion


-@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)])
+@router.get(
+    "/v1/models",
+    summary="Models",
+    dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
+)
 async def get_models(
     llama_proxy: LlamaProxy = Depends(get_llama_proxy),
 ) -> ModelList:
@@ -407,3 +426,51 @@ async def get_models(
             for model_alias in llama_proxy
         ],
     }
+
+
+extras_tag = "Extras"
+
+
+@router.post(
+    "/extras/tokenize",
+    summary="Tokenize",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def tokenize(
+    body: TokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> TokenizeInputResponse:
+    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
+
+    return {"tokens": tokens}
+
+
+@router.post(
+    "/extras/tokenize/count",
+    summary="Tokenize Count",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def count_query_tokens(
+    body: TokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> TokenizeInputCountResponse:
+    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
+
+    return {"count": len(tokens)}
+
+
+@router.post(
+    "/extras/detokenize",
+    summary="Detokenize",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def detokenize(
+    body: DetokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> DetokenizeInputResponse:
+    text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8")
+
+    return {"text": text}
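Not part of the diff: a hedged client-side sketch of how the new extras endpoints could be exercised against a locally running server. The base URL, the example text, and the assumption that no API key is configured (so `authenticate` is a pass-through) are illustrative; the field names (`input`, `tokens`, `count`, `text`) follow the handlers above.

# Illustrative client sketch, not part of the PR. Assumes the server listens on
# localhost:8000 with its default model and no API key configured.
import requests

BASE_URL = "http://localhost:8000"

# Tokenize a string into model-specific token ids.
tokens = requests.post(
    f"{BASE_URL}/extras/tokenize",
    json={"input": "Hello, world!"},
).json()["tokens"]

# Count tokens without returning the full list.
count = requests.post(
    f"{BASE_URL}/extras/tokenize/count",
    json={"input": "Hello, world!"},
).json()["count"]

# Round-trip the ids back into text.
text = requests.post(
    f"{BASE_URL}/extras/detokenize",
    json={"tokens": tokens},
).json()["text"]

print(count, text)

The `/extras/tokenize/count` route mirrors `/extras/tokenize` but returns only the length, which avoids sending a potentially long token list back to the client just to measure prompt size.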