Commit c139f8b

felipelo and abetlen authored
feat: Add endpoints for tokenize, detokenize and count tokens (abetlen#1136)
* Add endpoint to count tokens
* Add tokenize and detokenize endpoints
* Change response key to tokens for tokenize endpoint
* Fix dependency bug
* Cleanup
* Remove example added by mistake
* Move tokenize, detokenize, and count to Extras namespace. Tag existing endpoints

---------

Co-authored-by: Andrei Betlen <[email protected]>
1 parent 1f3156d commit c139f8b
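
The commit adds three routes under an "Extras" tag: POST /extras/tokenize, POST /extras/tokenize/count, and POST /extras/detokenize. A minimal sketch of calling them over HTTP, assuming the server is running at its default address http://localhost:8000 with no API key configured (both assumptions; adjust for your deployment):

import requests  # third-party HTTP client, assumed installed

BASE = "http://localhost:8000"  # assumed default address of the running server

# Tokenize a string; `model` is optional and falls back to the server default.
tokens = requests.post(
    f"{BASE}/extras/tokenize",
    json={"input": "How many tokens in this query?"},
).json()["tokens"]

# Count tokens without returning the token ids themselves.
count = requests.post(
    f"{BASE}/extras/tokenize/count",
    json={"input": "How many tokens in this query?"},
).json()["count"]

# Map the token ids back to text.
text = requests.post(
    f"{BASE}/extras/detokenize",
    json={"tokens": tokens},
).json()["text"]

print(count, tokens, text)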

File tree: 2 files changed, +105 -2 lines changed


llama_cpp/server/app.py

Lines changed: 69 additions & 2 deletions
@@ -41,6 +41,11 @@
     CreateEmbeddingRequest,
     CreateChatCompletionRequest,
     ModelList,
+    TokenizeInputRequest,
+    TokenizeInputResponse,
+    TokenizeInputCountResponse,
+    DetokenizeInputRequest,
+    DetokenizeInputResponse,
 )
 from llama_cpp.server.errors import RouteErrorHandler

@@ -196,6 +201,9 @@ async def authenticate(
     )


+openai_v1_tag = "OpenAI V1"
+
+
 @router.post(
     "/v1/completions",
     summary="Completion",
@@ -227,11 +235,13 @@ async def authenticate(
             },
         }
     },
+    tags=[openai_v1_tag],
 )
 @router.post(
     "/v1/engines/copilot-codex/completions",
     include_in_schema=False,
     dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
 )
 async def create_completion(
     request: Request,
@@ -297,7 +307,10 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:


 @router.post(
-    "/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)]
+    "/v1/embeddings",
+    summary="Embedding",
+    dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
 )
 async def create_embedding(
     request: CreateEmbeddingRequest,
@@ -339,6 +352,7 @@ async def create_embedding(
             },
         }
     },
+    tags=[openai_v1_tag],
 )
 async def create_chat_completion(
     request: Request,
@@ -391,7 +405,12 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]:
     return iterator_or_completion


-@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)])
+@router.get(
+    "/v1/models",
+    summary="Models",
+    dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
+)
 async def get_models(
     llama_proxy: LlamaProxy = Depends(get_llama_proxy),
 ) -> ModelList:
@@ -407,3 +426,51 @@ async def get_models(
             for model_alias in llama_proxy
         ],
     }
+
+
+extras_tag = "Extras"
+
+
+@router.post(
+    "/extras/tokenize",
+    summary="Tokenize",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def tokenize(
+    body: TokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> TokenizeInputResponse:
+    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
+
+    return {"tokens": tokens}
+
+
+@router.post(
+    "/extras/tokenize/count",
+    summary="Tokenize Count",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def count_query_tokens(
+    body: TokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> TokenizeInputCountResponse:
+    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
+
+    return {"count": len(tokens)}
+
+
+@router.post(
+    "/extras/detokenize",
+    summary="Detokenize",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def detokenize(
+    body: DetokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> DetokenizeInputResponse:
+    text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8")
+
+    return {"text": text}

llama_cpp/server/types.py

Lines changed: 36 additions & 0 deletions
@@ -264,3 +264,39 @@ class ModelData(TypedDict):
 class ModelList(TypedDict):
     object: Literal["list"]
     data: List[ModelData]
+
+
+class TokenizeInputRequest(BaseModel):
+    model: Optional[str] = model_field
+    input: Optional[str] = Field(description="The input to tokenize.")
+
+    model_config = {
+        "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}
+    }
+
+
+class TokenizeInputResponse(BaseModel):
+    tokens: List[int] = Field(description="A list of tokens.")
+
+    model_config = {"json_schema_extra": {"example": {"tokens": [123, 321, 222]}}}
+
+
+class TokenizeInputCountResponse(BaseModel):
+    count: int = Field(description="The number of tokens in the input.")
+
+    model_config = {"json_schema_extra": {"example": {"count": 5}}}
+
+
+class DetokenizeInputRequest(BaseModel):
+    model: Optional[str] = model_field
+    tokens: List[int] = Field(description="A list of tokens to detokenize.")
+
+    model_config = {"json_schema_extra": {"example": [{"tokens": [123, 321, 222]}]}}
+
+
+class DetokenizeInputResponse(BaseModel):
+    text: str = Field(description="The detokenized text.")
+
+    model_config = {
+        "json_schema_extra": {"example": {"text": "How many tokens in this query?"}}
+    }
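
Because the request bodies are Pydantic models, malformed payloads are rejected with a 422 response before a handler runs. A quick local check of the validation, exercising only the models and not the running server:

from llama_cpp.server.types import DetokenizeInputRequest, TokenizeInputRequest

# `model` may be omitted; the server then falls back to its default model.
req = TokenizeInputRequest.model_validate({"input": "How many tokens in this query?"})
print(req.model, req.input)  # None How many tokens in this query?

detok = DetokenizeInputRequest.model_validate({"tokens": [123, 321, 222]})
print(detok.tokens)  # [123, 321, 222]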
