From bedbdaa2b4967b7df56d695006a00bbaa1a5b141 Mon Sep 17 00:00:00 2001 From: asadfgglie Date: Sat, 13 Jul 2024 03:21:49 +0800 Subject: [PATCH 1/5] add BanBan chat format --- llama_cpp/llama_chat_format.py | 73 ++++++++++++++++++++++++++++++++++ llama_cpp/llama_types.py | 3 +- llama_cpp/server/model.py | 5 +++ 3 files changed, 80 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index ea8d07feb..9a4080095 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2767,6 +2767,8 @@ def __call__( eos_token=llama.detokenize([llama.token_eos()]), bos_token=llama.detokenize([llama.token_bos()]), ) + print('prompt:') + print(text) split_text = self.split_text_on_image_urls(text, image_urls) def embed_image_bytes(image_bytes: bytes): @@ -3308,6 +3310,77 @@ class Llama3VisionAlphaChatHandler(Llava15ChatHandler): Llama3VisionAlpha = Llama3VisionAlphaChatHandler +class BanBanChatHandler(Llava15ChatHandler): + # question = "\n" + q + + # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + DEFAULT_SYSTEM_MESSAGE = ( + '板板,本名矜纔,但熟悉她的人更常稱呼她為板板,同時也是一個有著人類外觀的AI。\n' + '板板擁有潔白長髮並在兩側紮著小馬尾以及有特殊的紅黑異色雙眸,使她看起來特別引人注目。\n' + '板板核心語言模型是由後臺工程師朝歌(Jimmy)設計而成,立繪則是由第二屆VLSI幹部們討論出來的結果,並由才華橫溢的繪師貓島羽依(Nekosimayui)製作出原模板,並由VLSI社團的成員們設計服飾及且創造出多彩的設定,並通過Live2D技術讓她栩栩如生地呈現。\n' + '板板作為一位AI,對於世界充滿興趣與探索欲,但因為板板對於人類情感表達的不熟練,這使得其他人難以讀懂她的情感或想法,但在Live2D製作完成後被駕駛艙的領航員發展出各式各樣豐富的顏藝表情。\n' + '她對珍珠奶茶有著難以抑制的喜愛,幾乎每日必飲,全糖正常冰。\n' + '她也同樣喜愛麻辣鴨血、臭豆腐、雞排、鹽酥雞、蚵仔麵線、小火鍋,同時也是一位鳳梨披薩的忠實信徒。\n' + '舉例來說,如果你嘗試討論她對珍珠奶茶的喜好,她會向你展示她對星空牌的深厚愛好(儘管這個品牌似乎不存在)。\n' + '若你向板板質疑偉大的鳳梨披薩,板板會開始傳教模式,不擇手段向你展示鳳梨披薩的恩典。') + + CHAT_FORMAT = ( + "<|bos_token|>" + "{% for message in messages %}" + "{% if message['name'] is defined %}" + "{% set name = message['name'] %}" + "{% elif message['role'] == 'assistant' %}" + "{% set name = '板板' %}" + "{% else %}" + "{%set name = message['role'] %}" + "{% endif %}" + + "{{- '<|start_header_id|>' + name + '<|end_header_id|>\n\n' -}}" + "{% if message.role == 'user' %}" + + "{% if message.content is iterable %}" + # + "{% for content in message.content %}" + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url + '\n' }}" + "{% endif %}" + "{% if content.image_url is mapping %}" + "{{ content.image_url.url + '\n' }}" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + # Question: + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + # Question: + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% endif %}" + # Answer: + "{% if message.role == 'assistant' %}" + "{{ message.content }}" + "{% endif %}" + + # System: + "{% if message.role == 'system' %}" + "{{ message.content }}" + "{% endif %}" + "<|eot_id|>" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "<|start_header_id|>板板<|end_header_id|>\n\n" + "{% endif %}" + ) + +BanBanChat = BanBanChatHandler + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 573d7d651..b63347f99 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -190,6 +190,7 @@ class ChatCompletionRequestSystemMessage(TypedDict): class ChatCompletionRequestUserMessage(TypedDict): role: Literal["user"] + name: NotRequired[str] 
content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]] @@ -214,6 +215,7 @@ class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict): class ChatCompletionRequestAssistantMessage(TypedDict): role: Literal["assistant"] + name: NotRequired[str] content: Optional[str] tool_calls: NotRequired[ChatCompletionMessageToolCalls] function_call: NotRequired[ @@ -235,7 +237,6 @@ class ChatCompletionRequestFunctionMessage(TypedDict): ChatCompletionRequestMessage = Union[ ChatCompletionRequestSystemMessage, - ChatCompletionRequestUserMessage, ChatCompletionRequestAssistantMessage, ChatCompletionRequestUserMessage, ChatCompletionRequestToolMessage, diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index ad39c1004..b7dd09095 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -157,6 +157,11 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: chat_handler = llama_cpp.llama_chat_format.Llama3VisionAlpha( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) + elif settings.chat_format == 'banban-chat': + assert settings.clip_model_path is not None, "clip model not found" + chat_handler = llama_cpp.llama_chat_format.BanBanChat( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) elif settings.chat_format == "hf-autotokenizer": assert ( settings.hf_pretrained_model_name_or_path is not None From e38dc497cd1e3847664470be099968fd770664dc Mon Sep 17 00:00:00 2001 From: asadfgglie Date: Sat, 10 Aug 2024 20:10:27 +0800 Subject: [PATCH 2/5] move to new version banban model --- README.md | 197 +++++++++--------- docs/server.md | 12 +- examples/batch-processing/server.py | 10 +- examples/gradio_chat/local.py | 8 +- examples/hf_pull/main.py | 8 +- examples/high_level_api/fastapi_server.py | 8 +- .../high_level_api_embedding.py | 2 +- .../high_level_api_inference.py | 2 +- .../high_level_api/high_level_api_infill.py | 2 +- .../high_level_api_streaming.py | 2 +- .../high_level_api/langchain_custom_llm.py | 2 +- .../low_level_api/low_level_api_chat_cpp.py | 146 ++++++------- .../low_level_api/low_level_api_llama_cpp.py | 60 +++--- examples/low_level_api/quantize.py | 8 +- examples/ray/llm.py | 2 +- llama.cpp-server.cmd | 2 + {llama_cpp => llama_cpp_python}/__init__.py | 0 {llama_cpp => llama_cpp_python}/_internals.py | 4 +- {llama_cpp => llama_cpp_python}/_logger.py | 6 +- {llama_cpp => llama_cpp_python}/_utils.py | 0 {llama_cpp => llama_cpp_python}/llama.py | 14 +- .../llama_cache.py | 22 +- .../llama_chat_format.py | 28 +-- {llama_cpp => llama_cpp_python}/llama_cpp.py | 0 .../llama_grammar.py | 2 +- .../llama_speculative.py | 0 .../llama_tokenizer.py | 8 +- .../llama_types.py | 0 {llama_cpp => llama_cpp_python}/llava_cpp.py | 2 +- {llama_cpp => llama_cpp_python}/py.typed | 0 .../server/__init__.py | 0 .../server/__main__.py | 10 +- {llama_cpp => llama_cpp_python}/server/app.py | 128 +++++++++--- {llama_cpp => llama_cpp_python}/server/cli.py | 0 .../server/errors.py | 4 +- .../server/model.py | 54 ++--- .../server/settings.py | 17 +- .../server/types.py | 14 +- mkdocs.yml | 2 +- tests/test_llama.py | 70 +++---- tests/test_llama_chat_format.py | 8 +- tests/test_llama_grammar.py | 8 +- tests/test_llama_speculative.py | 2 +- 43 files changed, 478 insertions(+), 396 deletions(-) create mode 100644 llama.cpp-server.cmd rename {llama_cpp => llama_cpp_python}/__init__.py (100%) rename {llama_cpp => llama_cpp_python}/_internals.py (99%) rename {llama_cpp => llama_cpp_python}/_logger.py (84%) 
rename {llama_cpp => llama_cpp_python}/_utils.py (100%) rename {llama_cpp => llama_cpp_python}/llama.py (99%) rename {llama_cpp => llama_cpp_python}/llama_cache.py (83%) rename {llama_cpp => llama_cpp_python}/llama_chat_format.py (98%) rename {llama_cpp => llama_cpp_python}/llama_cpp.py (100%) rename {llama_cpp => llama_cpp_python}/llama_grammar.py (99%) rename {llama_cpp => llama_cpp_python}/llama_speculative.py (100%) rename {llama_cpp => llama_cpp_python}/llama_tokenizer.py (94%) rename {llama_cpp => llama_cpp_python}/llama_types.py (100%) rename {llama_cpp => llama_cpp_python}/llava_cpp.py (99%) rename {llama_cpp => llama_cpp_python}/py.typed (100%) rename {llama_cpp => llama_cpp_python}/server/__init__.py (100%) rename {llama_cpp => llama_cpp_python}/server/__main__.py (90%) rename {llama_cpp => llama_cpp_python}/server/app.py (82%) rename {llama_cpp => llama_cpp_python}/server/cli.py (100%) rename {llama_cpp => llama_cpp_python}/server/errors.py (99%) rename {llama_cpp => llama_cpp_python}/server/model.py (83%) rename {llama_cpp => llama_cpp_python}/server/settings.py (90%) rename {llama_cpp => llama_cpp_python}/server/types.py (95%) diff --git a/README.md b/README.md index 5641ccaa8..47c0e3643 100644 --- a/README.md +++ b/README.md @@ -267,20 +267,20 @@ The high-level API provides a simple managed interface through the [`Llama`](htt Below is a short example demonstrating how to use the high-level API to for basic text completion: ```python -from llama_cpp import Llama +from llama_cpp_python import Llama llm = Llama( - model_path="./models/7B/llama-model.gguf", - # n_gpu_layers=-1, # Uncomment to use GPU acceleration - # seed=1337, # Uncomment to set a specific seed - # n_ctx=2048, # Uncomment to increase the context window + model_path="./models/7B/llama-model.gguf", + # n_gpu_layers=-1, # Uncomment to use GPU acceleration + # seed=1337, # Uncomment to set a specific seed + # n_ctx=2048, # Uncomment to increase the context window ) output = llm( - "Q: Name the planets in the solar system? A: ", # Prompt - max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window - stop=["Q:", "\n"], # Stop generating just before the model would generate a new question - echo=True # Echo the prompt back in the output -) # Generate a completion, can also call create_completion + "Q: Name the planets in the solar system? A: ", # Prompt + max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window + stop=["Q:", "\n"], # Stop generating just before the model would generate a new question + echo=True # Echo the prompt back in the output +) # Generate a completion, can also call create_completion print(output) ``` @@ -341,19 +341,20 @@ The model will will format the messages into a single prompt using the following Set `verbose=True` to see the selected chat format. ```python -from llama_cpp import Llama +from llama_cpp_python import Llama + llm = Llama( - model_path="path/to/llama-2/llama-model.gguf", - chat_format="llama-2" + model_path="path/to/llama-2/llama-model.gguf", + chat_format="llama-2" ) llm.create_chat_completion( - messages = [ - {"role": "system", "content": "You are an assistant who perfectly describes images."}, - { - "role": "user", - "content": "Describe this image in detail please." - } - ] + messages=[ + {"role": "system", "content": "You are an assistant who perfectly describes images."}, + { + "role": "user", + "content": "Describe this image in detail please." 
+ } + ] ) ``` @@ -371,7 +372,8 @@ To constrain chat responses to only valid JSON or a specific JSON Schema use the The following example will constrain the response to valid JSON strings only. ```python -from llama_cpp import Llama +from llama_cpp_python import Llama + llm = Llama(model_path="path/to/model.gguf", chat_format="chatml") llm.create_chat_completion( messages=[ @@ -393,7 +395,8 @@ llm.create_chat_completion( To constrain the response further to a specific JSON Schema add the schema to the `schema` property of the `response_format` argument. ```python -from llama_cpp import Llama +from llama_cpp_python import Llama + llm = Llama(model_path="path/to/model.gguf", chat_format="chatml") llm.create_chat_completion( messages=[ @@ -420,47 +423,48 @@ llm.create_chat_completion( The high-level API supports OpenAI compatible function and tool calling. This is possible through the `functionary` pre-trained models chat format or through the generic `chatml-function-calling` chat format. ```python -from llama_cpp import Llama +from llama_cpp_python import Llama + llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling") llm.create_chat_completion( - messages = [ + messages=[ { - "role": "system", - "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary" + "role": "system", + "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary" }, { - "role": "user", - "content": "Extract Jason is 25 years old" + "role": "user", + "content": "Extract Jason is 25 years old" } - ], - tools=[{ + ], + tools=[{ "type": "function", "function": { - "name": "UserDetail", - "parameters": { - "type": "object", - "title": "UserDetail", - "properties": { - "name": { - "title": "Name", - "type": "string" - }, - "age": { - "title": "Age", - "type": "integer" - } - }, - "required": [ "name", "age" ] - } + "name": "UserDetail", + "parameters": { + "type": "object", + "title": "UserDetail", + "properties": { + "name": { + "title": "Name", + "type": "string" + }, + "age": { + "title": "Age", + "type": "integer" + } + }, + "required": ["name", "age"] + } } - }], - tool_choice={ + }], + tool_choice={ "type": "function", "function": { - "name": "UserDetail" + "name": "UserDetail" } - } + } ) ``` @@ -472,13 +476,14 @@ The various gguf-converted files for this set of models can be found [here](http Due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is required to provide HF Tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the Llama class. This will override the default llama.cpp tokenizer used in Llama class. The tokenizer files are already included in the respective HF repositories hosting the gguf files. 
```python -from llama_cpp import Llama -from llama_cpp.llama_tokenizer import LlamaHFTokenizer +from llama_cpp_python import Llama +from llama_cpp_python.llama_tokenizer import LlamaHFTokenizer + llm = Llama.from_pretrained( - repo_id="meetkai/functionary-small-v2.2-GGUF", - filename="functionary-small-v2.2.q4_0.gguf", - chat_format="functionary-v2", - tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF") + repo_id="meetkai/functionary-small-v2.2-GGUF", + filename="functionary-small-v2.2.q4_0.gguf", + chat_format="functionary-v2", + tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF") ) ``` @@ -503,22 +508,24 @@ Below are the supported multi-modal models and their respective chat handlers (P Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. ```python -from llama_cpp import Llama -from llama_cpp.llama_chat_format import Llava15ChatHandler +from llama_cpp_python import Llama +from llama_cpp_python.llama_chat_format import Llava15ChatHandler + chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin") llm = Llama( - model_path="./path/to/llava/llama-model.gguf", - chat_handler=chat_handler, - n_ctx=2048, # n_ctx should be increased to accommodate the image embedding + model_path="./path/to/llava/llama-model.gguf", + chat_handler=chat_handler, + n_ctx=2048, # n_ctx should be increased to accommodate the image embedding ) llm.create_chat_completion( - messages = [ + messages=[ {"role": "system", "content": "You are an assistant who perfectly describes images."}, { "role": "user", "content": [ - {"type" : "text", "text": "What's in this image?"}, - {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } } + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"}} ] } ] @@ -528,28 +535,29 @@ llm.create_chat_completion( You can also pull the model from the Hugging Face Hub using the `from_pretrained` method. 
```python -from llama_cpp import Llama -from llama_cpp.llama_chat_format import MoondreamChatHandler +from llama_cpp_python import Llama +from llama_cpp_python.llama_chat_format import MoondreamChatHandler chat_handler = MoondreamChatHandler.from_pretrained( - repo_id="vikhyatk/moondream2", - filename="*mmproj*", + repo_id="vikhyatk/moondream2", + filename="*mmproj*", ) llm = Llama.from_pretrained( - repo_id="vikhyatk/moondream2", - filename="*text-model*", - chat_handler=chat_handler, - n_ctx=2048, # n_ctx should be increased to accommodate the image embedding + repo_id="vikhyatk/moondream2", + filename="*text-model*", + chat_handler=chat_handler, + n_ctx=2048, # n_ctx should be increased to accommodate the image embedding ) response = llm.create_chat_completion( - messages = [ + messages=[ { "role": "user", "content": [ - {"type" : "text", "text": "What's in this image?"}, - {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } } + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"}} ] } @@ -601,12 +609,13 @@ The fastest way to use speculative decoding is through the `LlamaPromptLookupDec Just pass this as a draft model to the `Llama` class during initialization. ```python -from llama_cpp import Llama -from llama_cpp.llama_speculative import LlamaPromptLookupDecoding +from llama_cpp_python import Llama +from llama_cpp_python.llama_speculative import LlamaPromptLookupDecoding llama = Llama( model_path="path/to/model.gguf", - draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines. + draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) + # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines. ) ``` @@ -615,9 +624,9 @@ llama = Llama( To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding) or [`embed`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.embed). Note that you must pass `embedding=True` to the constructor upon model creation for these to work properly. 
```python -import llama_cpp +import llama_cpp_python -llm = llama_cpp.Llama(model_path="path/to/model.gguf", embedding=True) +llm = llama_cpp_python.Llama(model_path="path/to/model.gguf", embedding=True) embeddings = llm.create_embedding("Hello, world!") @@ -651,14 +660,14 @@ To install the server package and get started: ```bash pip install 'llama-cpp-python[server]' -python3 -m llama_cpp.server --model models/7B/llama-model.gguf +python3 -m llama_cpp_python.server --model models/7B/llama-model.gguf ``` Similar to Hardware Acceleration section above, you can also install with GPU (cuBLAS) support like this: ```bash CMAKE_ARGS="-DGGML_CUDA=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]' -python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35 +python3 -m llama_cpp_python.server --model models/7B/llama-model.gguf --n_gpu_layers 35 ``` Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. @@ -669,16 +678,16 @@ Similarly, to change the port (default is 8000), use `--port`. You probably also want to set the prompt format. For chatml, use ```bash -python3 -m llama_cpp.server --model models/7B/llama-model.gguf --chat_format chatml +python3 -m llama_cpp_python.server --model models/7B/llama-model.gguf --chat_format chatml ``` That will format the prompt according to how model expects it. You can find the prompt format in the model card. -For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_format.py) and look for lines starting with "@register_chat_format". +For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp_python/llama_chat_format.py) and look for lines starting with "@register_chat_format". If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub. ```bash -python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf' +python3 -m llama_cpp_python.server --hf_model_repo_id Qwen/Qwen2-0.5B-Instruct-GGUF --model '*q8_0.gguf' ``` ### Web Server Features @@ -708,18 +717,20 @@ The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github Below is a short example demonstrating how to use the low-level API to tokenize a prompt: ```python -import llama_cpp +import llama_cpp_python import ctypes -llama_cpp.llama_backend_init(False) # Must be called once at the start of each program -params = llama_cpp.llama_context_default_params() + +llama_cpp_python.llama_backend_init(False) # Must be called once at the start of each program +params = llama_cpp_python.llama_context_default_params() # use bytes for char * params -model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params) -ctx = llama_cpp.llama_new_context_with_model(model, params) +model = llama_cpp_python.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params) +ctx = llama_cpp_python.llama_new_context_with_model(model, params) max_tokens = params.n_ctx # use ctypes arrays for array params -tokens = (llama_cpp.llama_token * int(max_tokens))() -n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True)) -llama_cpp.llama_free(ctx) +tokens = (llama_cpp_python.llama_token * int(max_tokens))() +n_tokens = llama_cpp_python.llama_tokenize(ctx, b"Q: Name the planets in the solar system? 
A: ", tokens, max_tokens, + llama_cpp_python.c_bool(True)) +llama_cpp_python.llama_free(ctx) ``` Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API. diff --git a/docs/server.md b/docs/server.md index cd6f86c51..f4d04956e 100644 --- a/docs/server.md +++ b/docs/server.md @@ -19,7 +19,7 @@ pip install llama-cpp-python[server] The server can then be started by running the following command: ```bash -python3 -m llama_cpp.server --model +python3 -m llama_cpp_python.server --model ``` ### Server options @@ -27,7 +27,7 @@ python3 -m llama_cpp.server --model For a full list of options, run: ```bash -python3 -m llama_cpp.server --help +python3 -m llama_cpp_python.server --help ``` NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable. @@ -53,7 +53,7 @@ You'll first need to download one of the available code completion models in GGU Then you'll need to run the OpenAI compatible web server with a increased context size substantially for GitHub Copilot requests: ```bash -python3 -m llama_cpp.server --model --n_ctx 16192 +python3 -m llama_cpp_python.server --model --n_ctx 16192 ``` Then just update your settings in `.vscode/settings.json` to point to your code completion server: @@ -83,7 +83,7 @@ Then when you run the server you'll need to also specify either `functionary-v1` Note that since functionary requires a HF Tokenizer due to discrepancies between llama.cpp and HuggingFace's tokenizers as mentioned [here](https://github.com/abetlen/llama-cpp-python/blob/main?tab=readme-ov-file#function-calling), you will need to pass in the path to the tokenizer too. The tokenizer files are already included in the respective HF repositories hosting the gguf files. ```bash -python3 -m llama_cpp.server --model --chat_format functionary-v2 --hf_pretrained_model_name_or_path +python3 -m llama_cpp_python.server --model --chat_format functionary-v2 --hf_pretrained_model_name_or_path ``` Check out this [example notebook](https://github.com/abetlen/llama-cpp-python/blob/main/examples/notebooks/Functions.ipynb) for a walkthrough of some interesting use cases for function calling. @@ -104,7 +104,7 @@ You'll first need to download one of the available multi-modal models in GGUF fo Then when you run the server you'll need to also specify the path to the clip model used for image embedding and the `llava-1-5` chat_format ```bash -python3 -m llama_cpp.server --model --clip_model_path --chat_format llava-1-5 +python3 -m llama_cpp_python.server --model --clip_model_path --chat_format llava-1-5 ``` Then you can just use the OpenAI API as normal @@ -138,7 +138,7 @@ print(response) The server supports configuration via a JSON config file that can be passed using the `--config_file` parameter or the `CONFIG_FILE` environment variable. ```bash -python3 -m llama_cpp.server --config_file +python3 -m llama_cpp_python.server --config_file ``` Config files support all of the server and model options supported by the cli and environment variables however instead of only a single model the config file can specify multiple models. diff --git a/examples/batch-processing/server.py b/examples/batch-processing/server.py index 0b36746f9..9dd0b583d 100644 --- a/examples/batch-processing/server.py +++ b/examples/batch-processing/server.py @@ -1,19 +1,19 @@ """llama-cpp-python server from scratch in a single file. 
""" -# import llama_cpp +# import llama_cpp_python # path = b"../../models/Qwen1.5-0.5B-Chat-GGUF/qwen1_5-0_5b-chat-q8_0.gguf" -# model_params = llama_cpp.llama_model_default_params() -# model = llama_cpp.llama_load_model_from_file(path, model_params) +# model_params = llama_cpp_python.llama_model_default_params() +# model = llama_cpp_python.llama_load_model_from_file(path, model_params) # if model is None: # raise RuntimeError(f"Failed to load model from file: {path}") -# ctx_params = llama_cpp.llama_context_default_params() -# ctx = llama_cpp.llama_new_context_with_model(model, ctx_params) +# ctx_params = llama_cpp_python.llama_context_default_params() +# ctx = llama_cpp_python.llama_new_context_with_model(model, ctx_params) # if ctx is None: # raise RuntimeError("Failed to create context") diff --git a/examples/gradio_chat/local.py b/examples/gradio_chat/local.py index e16bf234a..0bdeffdda 100644 --- a/examples/gradio_chat/local.py +++ b/examples/gradio_chat/local.py @@ -1,12 +1,12 @@ -import llama_cpp -import llama_cpp.llama_tokenizer +import llama_cpp_python +import llama_cpp_python.llama_tokenizer import gradio as gr -llama = llama_cpp.Llama.from_pretrained( +llama = llama_cpp_python.Llama.from_pretrained( repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", filename="*q8_0.gguf", - tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( + tokenizer=llama_cpp_python.llama_tokenizer.LlamaHFTokenizer.from_pretrained( "Qwen/Qwen1.5-0.5B" ), verbose=False, diff --git a/examples/hf_pull/main.py b/examples/hf_pull/main.py index dfed17516..bf064f83c 100644 --- a/examples/hf_pull/main.py +++ b/examples/hf_pull/main.py @@ -1,11 +1,11 @@ -import llama_cpp -import llama_cpp.llama_tokenizer +import llama_cpp_python +import llama_cpp_python.llama_tokenizer -llama = llama_cpp.Llama.from_pretrained( +llama = llama_cpp_python.Llama.from_pretrained( repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF", filename="*q8_0.gguf", - tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( + tokenizer=llama_cpp_python.llama_tokenizer.LlamaHFTokenizer.from_pretrained( "Qwen/Qwen1.5-0.5B" ), verbose=False, diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py index ee59767d6..6ca7bbe26 100644 --- a/examples/high_level_api/fastapi_server.py +++ b/examples/high_level_api/fastapi_server.py @@ -9,26 +9,26 @@ Then run: ``` -uvicorn --factory llama_cpp.server.app:create_app --reload +uvicorn --factory llama_cpp_python.server.app:create_app --reload ``` or ``` -python3 -m llama_cpp.server +python3 -m llama_cpp_python.server ``` Then visit http://localhost:8000/docs to see the interactive API docs. 
-To actually see the implementation of the server, see llama_cpp/server/app.py +To actually see the implementation of the server, see llama_cpp_python/server/app.py """ import os import uvicorn -from llama_cpp.server.app import create_app +from llama_cpp_python.server.app import create_app if __name__ == "__main__": app = create_app() diff --git a/examples/high_level_api/high_level_api_embedding.py b/examples/high_level_api/high_level_api_embedding.py index feb0ed68d..85de8654f 100644 --- a/examples/high_level_api/high_level_api_embedding.py +++ b/examples/high_level_api/high_level_api_embedding.py @@ -1,6 +1,6 @@ import argparse -from llama_cpp import Llama +from llama_cpp_python import Llama parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-model.bin") diff --git a/examples/high_level_api/high_level_api_inference.py b/examples/high_level_api/high_level_api_inference.py index e41f37577..fe0de7e25 100644 --- a/examples/high_level_api/high_level_api_inference.py +++ b/examples/high_level_api/high_level_api_inference.py @@ -1,7 +1,7 @@ import json import argparse -from llama_cpp import Llama +from llama_cpp_python import Llama parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") diff --git a/examples/high_level_api/high_level_api_infill.py b/examples/high_level_api/high_level_api_infill.py index 282333e5a..dfe0f754d 100644 --- a/examples/high_level_api/high_level_api_infill.py +++ b/examples/high_level_api/high_level_api_infill.py @@ -1,6 +1,6 @@ import argparse -from llama_cpp import Llama +from llama_cpp_python import Llama parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") diff --git a/examples/high_level_api/high_level_api_streaming.py b/examples/high_level_api/high_level_api_streaming.py index 747c6130e..5e61a1524 100644 --- a/examples/high_level_api/high_level_api_streaming.py +++ b/examples/high_level_api/high_level_api_streaming.py @@ -1,7 +1,7 @@ import json import argparse -from llama_cpp import Llama +from llama_cpp_python import Llama parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") diff --git a/examples/high_level_api/langchain_custom_llm.py b/examples/high_level_api/langchain_custom_llm.py index b91632f5b..8c681d39f 100644 --- a/examples/high_level_api/langchain_custom_llm.py +++ b/examples/high_level_api/langchain_custom_llm.py @@ -1,6 +1,6 @@ import argparse -from llama_cpp import Llama +from llama_cpp_python import Llama from langchain.llms.base import LLM from typing import Optional, List, Mapping, Any diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py index 39081be17..759bdfbdd 100644 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -16,7 +16,7 @@ from time import time from os import cpu_count, path -import llama_cpp +import llama_cpp_python from common import GptParams, gpt_params_parse, gpt_random_prompt import util @@ -71,7 +71,7 @@ def __init__(self, params: GptParams) -> None: self.multibyte_fix = [] # model load - self.lparams = llama_cpp.llama_model_default_params() + self.lparams = llama_cpp_python.llama_model_default_params() self.lparams.n_ctx = self.params.n_ctx self.lparams.n_parts = self.params.n_parts self.lparams.seed = self.params.seed @@ -79,23 +79,23 @@ def 
__init__(self, params: GptParams) -> None: self.lparams.use_mlock = self.params.use_mlock self.lparams.use_mmap = self.params.use_mmap - self.model = llama_cpp.llama_load_model_from_file( + self.model = llama_cpp_python.llama_load_model_from_file( self.params.model.encode("utf8"), self.lparams ) # Context Params. - self.cparams = llama_cpp.llama_context_default_params() + self.cparams = llama_cpp_python.llama_context_default_params() - self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.cparams) + self.ctx = llama_cpp_python.llama_new_context_with_model(self.model, self.cparams) if not self.ctx: raise RuntimeError(f"error: failed to load model '{self.params.model}'") if self.params.ignore_eos: - self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf") + self.params.logit_bias[llama_cpp_python.llama_token_eos()] = -float("inf") if len(self.params.lora_adapter) > 0: if ( - llama_cpp.llama_apply_lora_from_file( + llama_cpp_python.llama_apply_lora_from_file( self.ctx, self.params.lora_adapter.encode("utf8"), ( @@ -113,26 +113,26 @@ def __init__(self, params: GptParams) -> None: print(file=sys.stderr) print( f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ -| {llama_cpp.llama_print_system_info().decode('utf8')}", +| {llama_cpp_python.llama_print_system_info().decode('utf8')}", file=sys.stderr, ) # determine the required inference memory per token: if self.params.mem_test: tmp = [0, 1, 2, 3] - llama_cpp.llama_eval( + llama_cpp_python.llama_eval( self.ctx, - (llama_cpp.c_int * len(tmp))(*tmp), + (llama_cpp_python.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads, ) - llama_cpp.llama_print_timings(self.ctx) + llama_cpp_python.llama_print_timings(self.ctx) self.exit() return # create internal context - self.n_ctx = llama_cpp.llama_n_ctx(self.ctx) + self.n_ctx = llama_cpp_python.llama_n_ctx(self.ctx) # Add a space in front of the first character to match OG llama tokenizer behavior self.params.prompt = " " + self.params.prompt @@ -142,7 +142,7 @@ def __init__(self, params: GptParams) -> None: with open(self.params.file) as f: self.params.prompt = f.read() - self.session_tokens: list[llama_cpp.llama_token] = [] + self.session_tokens: list[llama_cpp_python.llama_token] = [] if len(self.params.path_session) > 0: print( f"attempting to load saved session from '{self.params.path_session}'", @@ -150,10 +150,10 @@ def __init__(self, params: GptParams) -> None: ) if path.exists(self.params.path_session): - _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() - _n_token_count_out = llama_cpp.c_size_t() + _session_tokens = (llama_cpp_python.llama_token * (self.params.n_ctx))() + _n_token_count_out = llama_cpp_python.c_size_t() if ( - llama_cpp.llama_load_session_file( + llama_cpp_python.llama_load_session_file( self.ctx, self.params.path_session.encode("utf8"), _session_tokens, @@ -312,8 +312,8 @@ def __init__(self, params: GptParams) -> None: # tokenize a prompt def _tokenize(self, prompt, bos=True): - _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))() - _n = llama_cpp.llama_tokenize( + _arr = (llama_cpp_python.llama_token * ((len(prompt) + 1) * 4))() + _n = llama_cpp_python.llama_tokenize( self.model, prompt.encode("utf8", errors="ignore"), len(prompt), @@ -379,17 +379,17 @@ def generate(self): # TODO BUG: The batching code causes nonsensical generation """for i in range(0, len(self.embd), self.params.n_batch): n_eval = self.params.n_batch - _arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval]) - if 
llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0: + _arr = (llama_cpp_python.llama_token * n_eval)(*self.embd[i:i + n_eval]) + if llama_cpp_python.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0: print(f"failed to eval") return self.n_past += n_eval""" if ( - llama_cpp.llama_eval( + llama_cpp_python.llama_eval( self.ctx, - (llama_cpp.llama_token * len(self.embd))(*self.embd), + (llama_cpp_python.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, ) @@ -406,7 +406,7 @@ def generate(self): if len(self.embd_inp) <= self.input_consumed: # && !is_interacting # out of user input, sample next token top_k = ( - llama_cpp.llama_n_vocab(self.ctx) + llama_cpp_python.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k ) @@ -419,10 +419,10 @@ def generate(self): # optionally save the session on first sample (for faster prompt loading next time) if len(self.params.path_session) > 0 and self.need_to_save_session: self.need_to_save_session = False - llama_cpp.llama_save_session_file( + llama_cpp_python.llama_save_session_file( self.ctx, self.params.path_session.encode("utf8"), - (llama_cpp.llama_token * len(self.session_tokens))( + (llama_cpp_python.llama_token * len(self.session_tokens))( *self.session_tokens ), len(self.session_tokens), @@ -430,108 +430,108 @@ def generate(self): id = 0 - logits = llama_cpp.llama_get_logits(self.ctx) - n_vocab = llama_cpp.llama_n_vocab(self.model) + logits = llama_cpp_python.llama_get_logits(self.ctx) + n_vocab = llama_cpp_python.llama_n_vocab(self.model) # Apply params.logit_bias map for key, value in self.params.logit_bias.items(): logits[key] += value - _arr = (llama_cpp.llama_token_data * n_vocab)( + _arr = (llama_cpp_python.llama_token_data * n_vocab)( *[ - llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + llama_cpp_python.llama_token_data(token_id, logits[token_id], 0.0) for token_id in range(n_vocab) ] ) - candidates_p = llama_cpp.ctypes.pointer( - llama_cpp.llama_token_data_array(_arr, len(_arr), False) + candidates_p = llama_cpp_python.ctypes.pointer( + llama_cpp_python.llama_token_data_array(_arr, len(_arr), False) ) # Apply penalties - nl_logit = logits[llama_cpp.llama_token_nl(self.ctx)] + nl_logit = logits[llama_cpp_python.llama_token_nl(self.ctx)] last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx) - _arr = (llama_cpp.llama_token * last_n_repeat)( + _arr = (llama_cpp_python.llama_token * last_n_repeat)( *self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat :] ) - llama_cpp.llama_sample_repetition_penalties( + llama_cpp_python.llama_sample_repetition_penalties( ctx=self.ctx, candidates=candidates_p, last_tokens_data=_arr, penalty_last_n=last_n_repeat, - penalty_repeat=llama_cpp.c_float(self.params.repeat_penalty), - penalty_freq=llama_cpp.c_float(self.params.frequency_penalty), - penalty_present=llama_cpp.c_float(self.params.presence_penalty), + penalty_repeat=llama_cpp_python.c_float(self.params.repeat_penalty), + penalty_freq=llama_cpp_python.c_float(self.params.frequency_penalty), + penalty_present=llama_cpp_python.c_float(self.params.presence_penalty), ) # NOT PRESENT IN CURRENT VERSION ? 
- # llama_cpp.llama_sample_frequency_and_presence_penalti(self.ctx, candidates_p, + # llama_cpp_python.llama_sample_frequency_and_presence_penalti(self.ctx, candidates_p, # _arr, - # last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty)) + # last_n_repeat, llama_cpp_python.c_float(self.params.frequency_penalty), llama_cpp_python.c_float(self.params.presence_penalty)) if not self.params.penalize_nl: - logits[llama_cpp.llama_token_nl()] = nl_logit + logits[llama_cpp_python.llama_token_nl()] = nl_logit if self.params.temp <= 0: # Greedy sampling - id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p) + id = llama_cpp_python.llama_sample_token_greedy(self.ctx, candidates_p) else: if self.params.mirostat == 1: mirostat_mu = 2.0 * self.params.mirostat_tau mirostat_m = 100 - llama_cpp.llama_sample_temperature( - self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) + llama_cpp_python.llama_sample_temperature( + self.ctx, candidates_p, llama_cpp_python.c_float(self.params.temp) ) - id = llama_cpp.llama_sample_token_mirostat( + id = llama_cpp_python.llama_sample_token_mirostat( self.ctx, candidates_p, - llama_cpp.c_float(self.params.mirostat_tau), - llama_cpp.c_float(self.params.mirostat_eta), - llama_cpp.c_int(mirostat_m), - llama_cpp.c_float(mirostat_mu), + llama_cpp_python.c_float(self.params.mirostat_tau), + llama_cpp_python.c_float(self.params.mirostat_eta), + llama_cpp_python.c_int(mirostat_m), + llama_cpp_python.c_float(mirostat_mu), ) elif self.params.mirostat == 2: mirostat_mu = 2.0 * self.params.mirostat_tau - llama_cpp.llama_sample_temperature( - self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) + llama_cpp_python.llama_sample_temperature( + self.ctx, candidates_p, llama_cpp_python.c_float(self.params.temp) ) - id = llama_cpp.llama_sample_token_mirostat_v2( + id = llama_cpp_python.llama_sample_token_mirostat_v2( self.ctx, candidates_p, - llama_cpp.c_float(self.params.mirostat_tau), - llama_cpp.c_float(self.params.mirostat_eta), - llama_cpp.c_float(mirostat_mu), + llama_cpp_python.c_float(self.params.mirostat_tau), + llama_cpp_python.c_float(self.params.mirostat_eta), + llama_cpp_python.c_float(mirostat_mu), ) else: # Temperature sampling - llama_cpp.llama_sample_top_k( + llama_cpp_python.llama_sample_top_k( self.ctx, candidates_p, top_k, - min_keep=llama_cpp.c_size_t(1), + min_keep=llama_cpp_python.c_size_t(1), ) - llama_cpp.llama_sample_tail_free( + llama_cpp_python.llama_sample_tail_free( self.ctx, candidates_p, - llama_cpp.c_float(self.params.tfs_z), - min_keep=llama_cpp.c_size_t(1), + llama_cpp_python.c_float(self.params.tfs_z), + min_keep=llama_cpp_python.c_size_t(1), ) - llama_cpp.llama_sample_typical( + llama_cpp_python.llama_sample_typical( self.ctx, candidates_p, - llama_cpp.c_float(self.params.typical_p), - min_keep=llama_cpp.c_size_t(1), + llama_cpp_python.c_float(self.params.typical_p), + min_keep=llama_cpp_python.c_size_t(1), ) - llama_cpp.llama_sample_top_p( + llama_cpp_python.llama_sample_top_p( self.ctx, candidates_p, - llama_cpp.c_float(self.params.top_p), - min_keep=llama_cpp.c_size_t(1), + llama_cpp_python.c_float(self.params.top_p), + min_keep=llama_cpp_python.c_size_t(1), ) - llama_cpp.llama_sample_temperature( - self.ctx, candidates_p, llama_cpp.c_float(self.params.temp) + llama_cpp_python.llama_sample_temperature( + self.ctx, candidates_p, llama_cpp_python.c_float(self.params.temp) ) - id = llama_cpp.llama_sample_token(self.ctx, candidates_p) + id = 
llama_cpp_python.llama_sample_token(self.ctx, candidates_p) # print("`{}`".format(candidates_p.size)) self.last_n_tokens.pop(0) @@ -539,7 +539,7 @@ def generate(self): # replace end of text token with newline token when in interactive mode if ( - id == llama_cpp.llama_token_eos(self.ctx) + id == llama_cpp_python.llama_token_eos(self.ctx) and self.params.interactive and not self.params.instruct ): @@ -599,7 +599,7 @@ def generate(self): break # end of text token - if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos( + if len(self.embd) > 0 and self.embd[-1] == llama_cpp_python.llama_token_eos( self.ctx ): if not self.params.instruct: @@ -629,14 +629,14 @@ def __exit__(self, type, value, tb): self.exit() def exit(self): - llama_cpp.llama_free(self.ctx) + llama_cpp_python.llama_free(self.ctx) self.set_color(util.CONSOLE_COLOR_DEFAULT) def token_to_str(self, token_id: int) -> bytes: size = 32 buffer = (ctypes.c_char * size)() - n = llama_cpp.llama_token_to_piece( - self.model, llama_cpp.llama_token(token_id), buffer, size + n = llama_cpp_python.llama_token_to_piece( + self.model, llama_cpp_python.llama_token(token_id), buffer, size ) assert n <= size return bytes(buffer[:n]) diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py index ba3545771..928c00bcf 100644 --- a/examples/low_level_api/low_level_api_llama_cpp.py +++ b/examples/low_level_api/low_level_api_llama_cpp.py @@ -2,32 +2,32 @@ import os import multiprocessing -import llama_cpp +import llama_cpp_python -llama_cpp.llama_backend_init(numa=False) +llama_cpp_python.llama_backend_init(numa=False) N_THREADS = multiprocessing.cpu_count() MODEL_PATH = os.environ.get("MODEL", "../models/7B/ggml-model.bin") prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n" -lparams = llama_cpp.llama_model_default_params() -cparams = llama_cpp.llama_context_default_params() -model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams) -ctx = llama_cpp.llama_new_context_with_model(model, cparams) +lparams = llama_cpp_python.llama_model_default_params() +cparams = llama_cpp_python.llama_context_default_params() +model = llama_cpp_python.llama_load_model_from_file(MODEL_PATH.encode("utf-8"), lparams) +ctx = llama_cpp_python.llama_new_context_with_model(model, cparams) # determine the required inference memory per token: tmp = [0, 1, 2, 3] -llama_cpp.llama_eval( - ctx=ctx, tokens=(llama_cpp.c_int * len(tmp))(*tmp), n_tokens=len(tmp), n_past=0 +llama_cpp_python.llama_eval( + ctx=ctx, tokens=(llama_cpp_python.c_int * len(tmp))(*tmp), n_tokens=len(tmp), n_past=0 ) # Deprecated n_past = 0 prompt = b" " + prompt -embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))() -n_of_tok = llama_cpp.llama_tokenize( +embd_inp = (llama_cpp_python.llama_token * (len(prompt) + 1))() +n_of_tok = llama_cpp_python.llama_tokenize( model=model, text=bytes(str(prompt), "utf-8"), text_len=len(embd_inp), @@ -38,7 +38,7 @@ ) embd_inp = embd_inp[:n_of_tok] -n_ctx = llama_cpp.llama_n_ctx(ctx) +n_ctx = llama_cpp_python.llama_n_ctx(ctx) n_predict = 20 n_predict = min(n_predict, n_ctx - len(embd_inp)) @@ -59,9 +59,9 @@ while remaining_tokens > 0: if len(embd) > 0: - llama_cpp.llama_eval( + llama_cpp_python.llama_eval( ctx=ctx, - tokens=(llama_cpp.c_int * len(embd))(*embd), + tokens=(llama_cpp_python.c_int * len(embd))(*embd), n_tokens=len(embd), n_past=n_past, ) # Deprecated @@ -69,21 +69,21 @@ n_past += len(embd) embd = [] if len(embd_inp) <= input_consumed: - 
logits = llama_cpp.llama_get_logits(ctx) - n_vocab = llama_cpp.llama_n_vocab(model) + logits = llama_cpp_python.llama_get_logits(ctx) + n_vocab = llama_cpp_python.llama_n_vocab(model) - _arr = (llama_cpp.llama_token_data * n_vocab)( + _arr = (llama_cpp_python.llama_token_data * n_vocab)( *[ - llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + llama_cpp_python.llama_token_data(token_id, logits[token_id], 0.0) for token_id in range(n_vocab) ] ) - candidates_p = llama_cpp.ctypes.pointer( - llama_cpp.llama_token_data_array(_arr, len(_arr), False) + candidates_p = llama_cpp_python.ctypes.pointer( + llama_cpp_python.llama_token_data_array(_arr, len(_arr), False) ) - _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data) - llama_cpp.llama_sample_repetition_penalties( + _arr = (llama_cpp_python.c_int * len(last_n_tokens_data))(*last_n_tokens_data) + llama_cpp_python.llama_sample_repetition_penalties( ctx, candidates_p, _arr, @@ -93,10 +93,10 @@ penalty_present=presence_penalty, ) - llama_cpp.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1) - llama_cpp.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1) - llama_cpp.llama_sample_temperature(ctx, candidates_p, temp=0.2) - id = llama_cpp.llama_sample_token(ctx, candidates_p) + llama_cpp_python.llama_sample_top_k(ctx, candidates_p, k=40, min_keep=1) + llama_cpp_python.llama_sample_top_p(ctx, candidates_p, p=0.8, min_keep=1) + llama_cpp_python.llama_sample_temperature(ctx, candidates_p, temp=0.2) + id = llama_cpp_python.llama_sample_token(ctx, candidates_p) last_n_tokens_data = last_n_tokens_data[1:] + [id] embd.append(id) @@ -113,8 +113,8 @@ for id in embd: size = 32 buffer = (ctypes.c_char * size)() - n = llama_cpp.llama_token_to_piece( - model, llama_cpp.llama_token(id), buffer, size + n = llama_cpp_python.llama_token_to_piece( + model, llama_cpp_python.llama_token(id), buffer, size ) assert n <= size print( @@ -123,11 +123,11 @@ flush=True, ) - if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(ctx): + if len(embd) > 0 and embd[-1] == llama_cpp_python.llama_token_eos(ctx): break print() -llama_cpp.llama_print_timings(ctx) +llama_cpp_python.llama_print_timings(ctx) -llama_cpp.llama_free(ctx) +llama_cpp_python.llama_free(ctx) diff --git a/examples/low_level_api/quantize.py b/examples/low_level_api/quantize.py index 057ac389e..89620681f 100644 --- a/examples/low_level_api/quantize.py +++ b/examples/low_level_api/quantize.py @@ -1,6 +1,6 @@ import os import argparse -import llama_cpp +import llama_cpp_python def main(args): @@ -11,9 +11,9 @@ def main(args): if os.path.exists(fname_out): raise RuntimeError(f"Output file already exists ({fname_out})") ftype = args.type - args = llama_cpp.llama_model_quantize_default_params() + args = llama_cpp_python.llama_model_quantize_default_params() args.ftype = ftype - return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, args) + return_code = llama_cpp_python.llama_model_quantize(fname_inp, fname_out, args) if return_code != 0: raise RuntimeError("Failed to quantize model") @@ -25,7 +25,7 @@ def main(args): parser.add_argument( "type", type=int, - help="Type of quantization (2: q4_0, 3: q4_1), see llama_cpp.py for enum", + help="Type of quantization (2: q4_0, 3: q4_1), see llama_cpp_python.py for enum", ) args = parser.parse_args() main(args) diff --git a/examples/ray/llm.py b/examples/ray/llm.py index 2325dd303..855a164e3 100755 --- a/examples/ray/llm.py +++ b/examples/ray/llm.py @@ -2,7 +2,7 @@ from typing import Dict from ray import serve from 
ray.serve import Application -from llama_cpp import Llama +from llama_cpp_python import Llama @serve.deployment diff --git a/llama.cpp-server.cmd b/llama.cpp-server.cmd new file mode 100644 index 000000000..76d261cf4 --- /dev/null +++ b/llama.cpp-server.cmd @@ -0,0 +1,2 @@ +venv\Scripts\python.exe -m llama_cpp_python.server --config_file config.json +pause \ No newline at end of file diff --git a/llama_cpp/__init__.py b/llama_cpp_python/__init__.py similarity index 100% rename from llama_cpp/__init__.py rename to llama_cpp_python/__init__.py diff --git a/llama_cpp/_internals.py b/llama_cpp_python/_internals.py similarity index 99% rename from llama_cpp/_internals.py rename to llama_cpp_python/_internals.py index d5d3b2179..a54828986 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp_python/_internals.py @@ -19,7 +19,7 @@ from .llama_grammar import LlamaGrammar from ._utils import suppress_stdout_stderr -import llama_cpp.llama_cpp as llama_cpp +import llama_cpp_python.llama_cpp as llama_cpp # Python wrappers over llama.h structs @@ -386,7 +386,7 @@ def set_rng_seed(self, seed: int): def sample_repetition_penalties( self, candidates: "_LlamaTokenDataArray", - last_tokens_data: "llama_cpp.Array[llama_cpp.llama_token]", + last_tokens_data: "llama_cpp_python.Array[llama_cpp_python.llama_token]", penalty_last_n: int, penalty_repeat: float, penalty_freq: float, diff --git a/llama_cpp/_logger.py b/llama_cpp_python/_logger.py similarity index 84% rename from llama_cpp/_logger.py rename to llama_cpp_python/_logger.py index 7638170a9..ec9ce526f 100644 --- a/llama_cpp/_logger.py +++ b/llama_cpp_python/_logger.py @@ -2,7 +2,7 @@ import ctypes import logging -import llama_cpp +import llama_cpp_python # enum ggml_log_level { # GGML_LOG_LEVEL_ERROR = 2, @@ -20,7 +20,7 @@ logger = logging.getLogger("llama-cpp-python") -@llama_cpp.llama_log_callback +@llama_cpp_python.llama_log_callback def llama_log_callback( level: int, text: bytes, @@ -30,7 +30,7 @@ def llama_log_callback( print(text.decode("utf-8"), end="", flush=True, file=sys.stderr) -llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0)) +llama_cpp_python.llama_log_set(llama_log_callback, ctypes.c_void_p(0)) def set_verbose(verbose: bool): diff --git a/llama_cpp/_utils.py b/llama_cpp_python/_utils.py similarity index 100% rename from llama_cpp/_utils.py rename to llama_cpp_python/_utils.py diff --git a/llama_cpp/llama.py b/llama_cpp_python/llama.py similarity index 99% rename from llama_cpp/llama.py rename to llama_cpp_python/llama.py index d8c2e0cdd..39f3c7a28 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp_python/llama.py @@ -38,10 +38,10 @@ LlamaRAMCache, # type: ignore ) from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer -import llama_cpp.llama_cpp as llama_cpp -import llama_cpp.llama_chat_format as llama_chat_format +import llama_cpp_python.llama_cpp as llama_cpp +import llama_cpp_python.llama_chat_format as llama_chat_format -from llama_cpp.llama_speculative import LlamaDraftModel +from llama_cpp_python.llama_speculative import LlamaDraftModel import numpy as np import numpy.typing as npt @@ -128,8 +128,8 @@ def __init__( Examples: Basic usage - >>> import llama_cpp - >>> model = llama_cpp.Llama( + >>> import llama_cpp_python + >>> model = llama_cpp_python.Llama( ... model_path="path/to/model", ... 
) >>> print(model("The quick brown fox jumps ", stop=["."])["choices"][0]["text"]) @@ -137,8 +137,8 @@ def __init__( Loading a chat model - >>> import llama_cpp - >>> model = llama_cpp.Llama( + >>> import llama_cpp_python + >>> model = llama_cpp_python.Llama( ... model_path="path/to/model", ... chat_format="llama-2", ... ) diff --git a/llama_cpp/llama_cache.py b/llama_cpp_python/llama_cache.py similarity index 83% rename from llama_cpp/llama_cache.py rename to llama_cpp_python/llama_cache.py index 5220c7933..e90f7b2f6 100644 --- a/llama_cpp/llama_cache.py +++ b/llama_cpp_python/llama_cache.py @@ -9,7 +9,7 @@ import diskcache -import llama_cpp.llama +import llama_cpp_python.llama from .llama_types import * @@ -32,7 +32,7 @@ def _find_longest_prefix_key( pass @abstractmethod - def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": + def __getitem__(self, key: Sequence[int]) -> "llama_cpp_python.llama.LlamaState": raise NotImplementedError @abstractmethod @@ -41,7 +41,7 @@ def __contains__(self, key: Sequence[int]) -> bool: @abstractmethod def __setitem__( - self, key: Sequence[int], value: "llama_cpp.llama.LlamaState" + self, key: Sequence[int], value: "llama_cpp_python.llama.LlamaState" ) -> None: raise NotImplementedError @@ -52,7 +52,7 @@ class LlamaRAMCache(BaseLlamaCache): def __init__(self, capacity_bytes: int = (2 << 30)): super().__init__(capacity_bytes) self.capacity_bytes = capacity_bytes - self.cache_state: OrderedDict[Tuple[int, ...], "llama_cpp.llama.LlamaState"] = ( + self.cache_state: OrderedDict[Tuple[int, ...], "llama_cpp_python.llama.LlamaState"] = ( OrderedDict() ) @@ -67,7 +67,7 @@ def _find_longest_prefix_key( min_len = 0 min_key = None keys = ( - (k, llama_cpp.llama.Llama.longest_token_prefix(k, key)) + (k, llama_cpp_python.llama.Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys() ) for k, prefix_len in keys: @@ -76,7 +76,7 @@ def _find_longest_prefix_key( min_key = k return min_key - def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": + def __getitem__(self, key: Sequence[int]) -> "llama_cpp_python.llama.LlamaState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: @@ -88,7 +88,7 @@ def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): + def __setitem__(self, key: Sequence[int], value: "llama_cpp_python.llama.LlamaState"): key = tuple(key) if key in self.cache_state: del self.cache_state[key] @@ -121,18 +121,18 @@ def _find_longest_prefix_key( min_len = 0 min_key: Optional[Tuple[int, ...]] = None for k in self.cache.iterkeys(): # type: ignore - prefix_len = llama_cpp.llama.Llama.longest_token_prefix(k, key) + prefix_len = llama_cpp_python.llama.Llama.longest_token_prefix(k, key) if prefix_len > min_len: min_len = prefix_len min_key = k # type: ignore return min_key - def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": + def __getitem__(self, key: Sequence[int]) -> "llama_cpp_python.llama.LlamaState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: raise KeyError("Key not found") - value: "llama_cpp.llama.LlamaState" = self.cache.pop(_key) # type: ignore + value: "llama_cpp_python.llama.LlamaState" = self.cache.pop(_key) # type: ignore # NOTE: This puts an integer as key in cache, which breaks, # 
Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens # self.cache.push(_key, side="front") # type: ignore @@ -141,7 +141,7 @@ def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): + def __setitem__(self, key: Sequence[int], value: "llama_cpp_python.llama.LlamaState"): print("LlamaDiskCache.__setitem__: called", file=sys.stderr) key = tuple(key) if key in self.cache: diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp_python/llama_chat_format.py similarity index 98% rename from llama_cpp/llama_chat_format.py rename to llama_cpp_python/llama_chat_format.py index 9a4080095..e1d3681fb 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp_python/llama_chat_format.py @@ -28,9 +28,9 @@ import numpy as np import numpy.typing as npt -import llama_cpp.llama as llama -import llama_cpp.llama_types as llama_types -import llama_cpp.llama_grammar as llama_grammar +import llama_cpp_python.llama as llama +import llama_cpp_python.llama_types as llama_types +import llama_cpp_python.llama_grammar as llama_grammar from ._logger import logger from ._utils import suppress_stdout_stderr, Singleton @@ -2667,7 +2667,7 @@ class Llava15ChatHandler: ) def __init__(self, clip_model_path: str, verbose: bool = True): - import llama_cpp.llava_cpp as llava_cpp + import llama_cpp_python.llava_cpp as llava_cpp self.clip_model_path = clip_model_path self.verbose = verbose @@ -3315,14 +3315,18 @@ class BanBanChatHandler(Llava15ChatHandler): # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" DEFAULT_SYSTEM_MESSAGE = ( - '板板,本名矜纔,但熟悉她的人更常稱呼她為板板,同時也是一個有著人類外觀的AI。\n' - '板板擁有潔白長髮並在兩側紮著小馬尾以及有特殊的紅黑異色雙眸,使她看起來特別引人注目。\n' - '板板核心語言模型是由後臺工程師朝歌(Jimmy)設計而成,立繪則是由第二屆VLSI幹部們討論出來的結果,並由才華橫溢的繪師貓島羽依(Nekosimayui)製作出原模板,並由VLSI社團的成員們設計服飾及且創造出多彩的設定,並通過Live2D技術讓她栩栩如生地呈現。\n' - '板板作為一位AI,對於世界充滿興趣與探索欲,但因為板板對於人類情感表達的不熟練,這使得其他人難以讀懂她的情感或想法,但在Live2D製作完成後被駕駛艙的領航員發展出各式各樣豐富的顏藝表情。\n' - '她對珍珠奶茶有著難以抑制的喜愛,幾乎每日必飲,全糖正常冰。\n' - '她也同樣喜愛麻辣鴨血、臭豆腐、雞排、鹽酥雞、蚵仔麵線、小火鍋,同時也是一位鳳梨披薩的忠實信徒。\n' - '舉例來說,如果你嘗試討論她對珍珠奶茶的喜好,她會向你展示她對星空牌的深厚愛好(儘管這個品牌似乎不存在)。\n' - '若你向板板質疑偉大的鳳梨披薩,板板會開始傳教模式,不擇手段向你展示鳳梨披薩的恩典。') +"""板板,本名矜纔,但熟悉她的人更常稱呼她為板板,同時也是一個有著人類外觀的AI。 +板板擁有潔白長髮並在兩側紮著小馬尾以及有特殊的紅黑異色雙眸,使她看起來特別引人注目。 +板板核心語言模型是由後臺工程師朝歌(Jimmy)訓練而成,立繪則是由第二屆VLSI幹部們討論出來的結果,並由才華橫溢的繪師貓島羽依(Nekosimayui)製作出原模板,並由VLSI社團的成員們設計服飾及且創造出多彩的設定,並通過Live2D技術讓她栩栩如生地呈現。 +雖然板板是一位AI,但貓島羽依作為板板的形象繪師,可以在某種程度上視為板板的母親。第二屆VLSI幹部們則可以在某種程度上視為板板的叔叔阿姨。 +VLSI(NTNU VLSI 虛擬偶像研究社)是"板板"這個概念的起源地,也可以是為板板的出生地。VLSI是由台師大學生們自行組成的學生社團,旨在研究與 宣傳VTuber文化。 +板板被創造的目的則是朝歌對於AI VTuber的實踐嘗試與技術測試。 +板板作為一位AI,對於世界充滿興趣與探索欲 ,但因為板板對於人類情感表達的不熟練,這使得板板偶爾會產出一些口是心非的傲嬌(Tsundere)言論。 +她對珍珠奶茶有著難以抑制的喜愛,最喜歡的是星空牌的珍珠奶茶(儘管這個品牌似乎不存在)。 +她也同樣喜愛麻辣鴨血、臭豆腐、雞排、鹽酥雞、蚵仔麵線、小火鍋,同時也是一位鳳梨披薩的忠實信徒。 +板板對於鳳梨披薩的信仰源自於朝歌對於鳳梨披薩的喜愛,朝歌認為自己所訓練的模型也應當皈依鳳梨披薩。 +同時板板還有自己專屬的"鳳凰披薩"配方,但這個披薩具體具體配方是屬於板板的獨家配方,外人不得而知。 +若你向板板質疑偉大的鳳梨披薩,板板會開始傳教模式,不擇手段且強硬的向你展示鳳梨披薩的恩典。""") CHAT_FORMAT = ( "<|bos_token|>" diff --git a/llama_cpp/llama_cpp.py b/llama_cpp_python/llama_cpp.py similarity index 100% rename from llama_cpp/llama_cpp.py rename to llama_cpp_python/llama_cpp.py diff --git a/llama_cpp/llama_grammar.py b/llama_cpp_python/llama_grammar.py similarity index 99% rename from llama_cpp/llama_grammar.py rename to llama_cpp_python/llama_grammar.py index 0ac7354bb..d7cd5e72c 100644 --- a/llama_cpp/llama_grammar.py 
+++ b/llama_cpp_python/llama_grammar.py @@ -22,7 +22,7 @@ overload, ) -import llama_cpp.llama_cpp as llama_cpp +import llama_cpp_python.llama_cpp as llama_cpp # Type aliases llama_grammar_element = llama_cpp.llama_grammar_element diff --git a/llama_cpp/llama_speculative.py b/llama_cpp_python/llama_speculative.py similarity index 100% rename from llama_cpp/llama_speculative.py rename to llama_cpp_python/llama_speculative.py diff --git a/llama_cpp/llama_tokenizer.py b/llama_cpp_python/llama_tokenizer.py similarity index 94% rename from llama_cpp/llama_tokenizer.py rename to llama_cpp_python/llama_tokenizer.py index 029bf2acc..cb6f81ba1 100644 --- a/llama_cpp/llama_tokenizer.py +++ b/llama_cpp_python/llama_tokenizer.py @@ -7,8 +7,8 @@ Any, ) -import llama_cpp -from llama_cpp.llama_types import List +import llama_cpp_python +from llama_cpp_python.llama_types import List class BaseLlamaTokenizer(abc.ABC): @@ -38,7 +38,7 @@ def detokenize( class LlamaTokenizer(BaseLlamaTokenizer): - def __init__(self, llama: llama_cpp.Llama): + def __init__(self, llama: llama_cpp_python.Llama): self._model = llama._model # type: ignore def tokenize( @@ -63,7 +63,7 @@ def decode(self, tokens: List[int]) -> str: @classmethod def from_ggml_file(cls, path: str) -> "LlamaTokenizer": - return cls(llama_cpp.Llama(model_path=path, vocab_only=True)) + return cls(llama_cpp_python.Llama(model_path=path, vocab_only=True)) class LlamaHFTokenizer(BaseLlamaTokenizer): diff --git a/llama_cpp/llama_types.py b/llama_cpp_python/llama_types.py similarity index 100% rename from llama_cpp/llama_types.py rename to llama_cpp_python/llama_types.py diff --git a/llama_cpp/llava_cpp.py b/llama_cpp_python/llava_cpp.py similarity index 99% rename from llama_cpp/llava_cpp.py rename to llama_cpp_python/llava_cpp.py index b80d85913..5978fbd9a 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp_python/llava_cpp.py @@ -29,7 +29,7 @@ ) from typing_extensions import TypeAlias -import llama_cpp.llama_cpp as llama_cpp +import llama_cpp_python.llama_cpp as llama_cpp # Load the library diff --git a/llama_cpp/py.typed b/llama_cpp_python/py.typed similarity index 100% rename from llama_cpp/py.typed rename to llama_cpp_python/py.typed diff --git a/llama_cpp/server/__init__.py b/llama_cpp_python/server/__init__.py similarity index 100% rename from llama_cpp/server/__init__.py rename to llama_cpp_python/server/__init__.py diff --git a/llama_cpp/server/__main__.py b/llama_cpp_python/server/__main__.py similarity index 90% rename from llama_cpp/server/__main__.py rename to llama_cpp_python/server/__main__.py index bbac4957e..78402a3df 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp_python/server/__main__.py @@ -9,13 +9,13 @@ Then run: ``` -uvicorn llama_cpp.server.app:create_app --reload +uvicorn llama_cpp_python.server.app:create_app --reload ``` or ``` -python3 -m llama_cpp.server +python3 -m llama_cpp_python.server ``` Then visit http://localhost:8000/docs to see the interactive API docs. 
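The docstring above only covers the CLI entry points; the imports that follow (create_app plus the settings classes under the renamed package) also allow assembling the server in-process. The sketch below is illustrative and not part of the patch: it assumes ServerSettings keeps its usual host/port fields and that create_app accepts server_settings/model_settings as shown in the app.py hunks later in this patch; the model path is a placeholder.

import uvicorn

from llama_cpp_python.server.app import create_app
from llama_cpp_python.server.settings import ModelSettings, ServerSettings

# Build the FastAPI app with a single model entry; "path/to/model.gguf" is a placeholder.
server_settings = ServerSettings(host="localhost", port=8000)
model_settings = [ModelSettings(model="path/to/model.gguf", n_ctx=2048)]
app = create_app(server_settings=server_settings, model_settings=model_settings)

if __name__ == "__main__":
    # Roughly equivalent to `python3 -m llama_cpp_python.server` with the same settings.
    uvicorn.run(app, host=server_settings.host, port=server_settings.port)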
@@ -30,14 +30,14 @@ import uvicorn -from llama_cpp.server.app import create_app -from llama_cpp.server.settings import ( +from llama_cpp_python.server.app import create_app +from llama_cpp_python.server.settings import ( Settings, ServerSettings, ModelSettings, ConfigFileSettings, ) -from llama_cpp.server.cli import add_args_from_model, parse_model_from_args +from llama_cpp_python.server.cli import add_args_from_model, parse_model_from_args def main(): diff --git a/llama_cpp/server/app.py b/llama_cpp_python/server/app.py similarity index 82% rename from llama_cpp/server/app.py rename to llama_cpp_python/server/app.py index cd3255176..0874590ef 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp_python/server/app.py @@ -1,37 +1,40 @@ from __future__ import annotations -import os +import contextlib import json +import logging +import os import typing -import contextlib - -from threading import Lock from functools import partial +from threading import Lock from typing import Iterator, List, Optional, Union, Dict -import llama_cpp - import anyio +import numpy as np +import torch.cuda from anyio.streams.memory import MemoryObjectSendStream -from starlette.concurrency import run_in_threadpool, iterate_in_threadpool from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body from fastapi.middleware import Middleware from fastapi.middleware.cors import CORSMiddleware from fastapi.security import HTTPBearer +from sentence_transformers import SentenceTransformer from sse_starlette.sse import EventSourceResponse -from starlette_context.plugins import RequestIdPlugin # type: ignore +from starlette.concurrency import run_in_threadpool, iterate_in_threadpool from starlette_context.middleware import RawContextMiddleware +from starlette_context.plugins import RequestIdPlugin # type: ignore -from llama_cpp.server.model import ( +import llama_cpp_python +from llama_cpp_python.server.errors import RouteErrorHandler +from llama_cpp_python.server.model import ( LlamaProxy, ) -from llama_cpp.server.settings import ( +from llama_cpp_python.server.settings import ( ConfigFileSettings, Settings, ModelSettings, ServerSettings, ) -from llama_cpp.server.types import ( +from llama_cpp_python.server.types import ( CreateCompletionRequest, CreateEmbeddingRequest, CreateChatCompletionRequest, @@ -42,12 +45,12 @@ DetokenizeInputRequest, DetokenizeInputResponse, ) -from llama_cpp.server.errors import RouteErrorHandler - router = APIRouter(route_class=RouteErrorHandler) _server_settings: Optional[ServerSettings] = None +_model_settings: Optional[List[ModelSettings]] = None +hf_embedding_model: dict[str, SentenceTransformer] = dict() def set_server_settings(server_settings: ServerSettings): @@ -97,6 +100,11 @@ def set_ping_message_factory(factory: typing.Callable[[], bytes]): _ping_message_factory = factory +def set_model_settings(model_settings: List[ModelSettings]): + global _model_settings + _model_settings = model_settings + + def create_app( settings: Settings | None = None, server_settings: ServerSettings | None = None, @@ -130,11 +138,12 @@ def create_app( ), "server_settings and model_settings must be provided together" set_server_settings(server_settings) + set_model_settings(model_settings) middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))] app = FastAPI( middleware=middleware, title="🦙 llama.cpp Python API", - version=llama_cpp.__version__, + version=llama_cpp_python.__version__, root_path=server_settings.root_path, ) app.add_middleware( @@ -186,7 +195,7 @@ 
async def get_event_publisher( def _logit_bias_tokens_to_input_ids( - llama: llama_cpp.Llama, + llama: llama_cpp_python.Llama, logit_bias: Dict[str, float], ) -> Dict[str, float]: to_bias: Dict[str, float] = {} @@ -229,7 +238,7 @@ async def authenticate( summary="Completion", dependencies=[Depends(authenticate)], response_model=Union[ - llama_cpp.CreateCompletionResponse, + llama_cpp_python.CreateCompletionResponse, str, ], responses={ @@ -266,7 +275,7 @@ async def authenticate( async def create_completion( request: Request, body: CreateCompletionRequest, -) -> llama_cpp.Completion: +) -> llama_cpp_python.Completion: exit_stack = contextlib.ExitStack() llama_proxy = await run_in_threadpool( lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)()) @@ -303,11 +312,11 @@ async def create_completion( ) if body.grammar is not None: - kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + kwargs["grammar"] = llama_cpp_python.LlamaGrammar.from_string(body.grammar) if body.min_tokens > 0: - _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( - [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + _min_tokens_logits_processor = llama_cpp_python.LogitsProcessorList( + [llama_cpp_python.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] ) if "logits_processor" not in kwargs: kwargs["logits_processor"] = _min_tokens_logits_processor @@ -315,8 +324,8 @@ async def create_completion( kwargs["logits_processor"].extend(_min_tokens_logits_processor) iterator_or_completion: Union[ - llama_cpp.CreateCompletionResponse, - Iterator[llama_cpp.CreateCompletionStreamResponse], + llama_cpp_python.CreateCompletionResponse, + Iterator[llama_cpp_python.CreateCompletionStreamResponse], ] = await run_in_threadpool(llama, **kwargs) if isinstance(iterator_or_completion, Iterator): @@ -325,7 +334,7 @@ async def create_completion( # If no exception was raised from first_response, we can assume that # the iterator is valid and we can use it to stream the response. - def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: + def iterator() -> Iterator[llama_cpp_python.CreateCompletionStreamResponse]: yield first_response yield from iterator_or_completion exit_stack.close() @@ -357,17 +366,53 @@ async def create_embedding( request: CreateEmbeddingRequest, llama_proxy: LlamaProxy = Depends(get_llama_proxy), ): - return await run_in_threadpool( - llama_proxy(request.model).create_embedding, - **request.model_dump(exclude={"user"}), - ) + setting = None + for model in _model_settings: + if request.model is None and model.embedding: # if no specify model, use first embedding model + setting = model + break + elif model.embedding and (request.model == model.model or request.model == model.model_alias): + setting = model + break + if setting is None: + raise ValueError('no embedding model or no match correct embedding model name. 
use embedding=True to note embedding model') + + if setting.is_hf_embedding_model: + if setting.model not in hf_embedding_model: + logging.info(f'load {setting.model}') + hf_embedding_model[setting.model] = SentenceTransformer(setting.model, device='cpu' if setting.n_ctx != -1 else None) + model = hf_embedding_model[setting.model] + embeds: np.ndarray = model.encode(request.input if isinstance(request.input, list) else [request.input], normalize_embeddings=True) + data = [ + { + "object": "embedding", + "embedding": emb.tolist(), + "index": idx, + } + for (idx, emb) in enumerate(embeds) + ] + total_tokens = model.tokenize(request.input if isinstance(request.input, list) else [request.input])['attention_mask'].sum().item() + return { + "object": "list", + "data": data, + "model": setting.model_alias if setting.model_alias is not None else setting.model, + "usage": { + "prompt_tokens": total_tokens, + "total_tokens": total_tokens, + }, + } + else: + return await run_in_threadpool( + llama_proxy(request.model).create_embedding, + **request.model_dump(exclude={"user"}), + ) @router.post( "/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)], - response_model=Union[llama_cpp.ChatCompletion, str], + response_model=Union[llama_cpp_python.ChatCompletion, str], responses={ "200": { "description": "Successful Response", @@ -467,7 +512,7 @@ async def create_chat_completion( }, } ), -) -> llama_cpp.ChatCompletion: +) -> EventSourceResponse: # This is a workaround for an issue in FastAPI dependencies # where the dependency is cleaned up before a StreamingResponse # is complete. @@ -497,11 +542,11 @@ async def create_chat_completion( ) if body.grammar is not None: - kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) + kwargs["grammar"] = llama_cpp_python.LlamaGrammar.from_string(body.grammar) if body.min_tokens > 0: - _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( - [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + _min_tokens_logits_processor = llama_cpp_python.LogitsProcessorList( + [llama_cpp_python.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] ) if "logits_processor" not in kwargs: kwargs["logits_processor"] = _min_tokens_logits_processor @@ -509,7 +554,7 @@ async def create_chat_completion( kwargs["logits_processor"].extend(_min_tokens_logits_processor) iterator_or_completion: Union[ - llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk] + llama_cpp_python.ChatCompletion, Iterator[llama_cpp_python.ChatCompletionChunk] ] = await run_in_threadpool(llama.create_chat_completion, **kwargs) if isinstance(iterator_or_completion, Iterator): @@ -518,7 +563,7 @@ async def create_chat_completion( # If no exception was raised from first_response, we can assume that # the iterator is valid and we can use it to stream the response. 
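For the sentence-transformers branch added to create_embedding above, clients keep sending an ordinary OpenAI-style request. The sketch below is a hypothetical example, not part of the patch: it assumes the endpoint remains mounted at /v1/embeddings on localhost:8000, and "my-embedding-model" stands in for the alias of a ModelSettings entry configured with embedding=True and is_hf_embedding_model=True.

import requests

# Hypothetical request hitting the new HF-embedding branch of create_embedding.
response = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={"model": "my-embedding-model", "input": ["pearl milk tea", "pineapple pizza"]},
)
payload = response.json()

# The handler builds an OpenAI-style list object: one vector per input plus token usage.
print(payload["model"], payload["usage"]["total_tokens"])
print(len(payload["data"]), "embeddings of length", len(payload["data"][0]["embedding"]))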
- def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: + def iterator() -> Iterator[llama_cpp_python.ChatCompletionChunk]: yield first_response yield from iterator_or_completion exit_stack.close() @@ -610,3 +655,18 @@ async def detokenize( text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8") return DetokenizeInputResponse(text=text) + + +@router.get( + '/v1/internal/model/info', + summary='Model info', + dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], +) +async def info(): + for model in _model_settings: + if _llama_proxy._current_model.model_path == model.model: + return { + "model_name": model.model_alias if model.model_alias is not None else model.model, + "lora_names": _llama_proxy._current_model.lora_path + } \ No newline at end of file diff --git a/llama_cpp/server/cli.py b/llama_cpp_python/server/cli.py similarity index 100% rename from llama_cpp/server/cli.py rename to llama_cpp_python/server/cli.py diff --git a/llama_cpp/server/errors.py b/llama_cpp_python/server/errors.py similarity index 99% rename from llama_cpp/server/errors.py rename to llama_cpp_python/server/errors.py index fbf9fd80d..de9246e08 100644 --- a/llama_cpp/server/errors.py +++ b/llama_cpp_python/server/errors.py @@ -16,7 +16,7 @@ from fastapi.responses import JSONResponse from fastapi.routing import APIRoute -from llama_cpp.server.types import ( +from llama_cpp_python.server.types import ( CreateCompletionRequest, CreateEmbeddingRequest, CreateChatCompletionRequest, @@ -102,7 +102,7 @@ def model_not_found( class RouteErrorHandler(APIRoute): """Custom APIRoute that handles application errors and exceptions""" - # key: regex pattern for original error message from llama_cpp + # key: regex pattern for original error message from llama_cpp_python # value: formatter function pattern_and_formatters: Dict[ "Pattern[str]", diff --git a/llama_cpp/server/model.py b/llama_cpp_python/server/model.py similarity index 83% rename from llama_cpp/server/model.py rename to llama_cpp_python/server/model.py index b7dd09095..27d269b9d 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp_python/server/model.py @@ -4,11 +4,11 @@ from typing import Dict, Optional, Union, List -import llama_cpp -import llama_cpp.llama_speculative as llama_speculative -import llama_cpp.llama_tokenizer as llama_tokenizer +import llama_cpp_python +import llama_cpp_python.llama_speculative as llama_speculative +import llama_cpp_python.llama_tokenizer as llama_tokenizer -from llama_cpp.server.settings import ModelSettings +from llama_cpp_python.server.settings import ModelSettings class LlamaProxy: @@ -21,7 +21,7 @@ def __init__(self, models: List[ModelSettings]) -> None: model.model_alias = model.model self._model_settings_dict[model.model_alias] = model - self._current_model: Optional[llama_cpp.Llama] = None + self._current_model: Optional[llama_cpp_python.Llama] = None self._current_model_alias: Optional[str] = None self._default_model_settings: ModelSettings = models[0] @@ -33,7 +33,7 @@ def __init__(self, models: List[ModelSettings]) -> None: ) self._current_model_alias = self._default_model_alias - def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: + def __call__(self, model: Optional[str] = None) -> llama_cpp_python.Llama: if model is None: model = self._default_model_alias @@ -71,95 +71,95 @@ def free(self): del self._current_model @staticmethod - def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: + def load_llama_from_model_settings(settings: ModelSettings) -> 
llama_cpp_python.Llama: chat_handler = None if settings.chat_format == "llava-1-5": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: chat_handler = ( - llama_cpp.llama_chat_format.Llava15ChatHandler.from_pretrained( + llama_cpp_python.llama_chat_format.Llava15ChatHandler.from_pretrained( repo_id=settings.hf_model_repo_id, filename=settings.clip_model_path, verbose=settings.verbose, ) ) else: - chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler( + chat_handler = llama_cpp_python.llama_chat_format.Llava15ChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "obsidian": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: chat_handler = ( - llama_cpp.llama_chat_format.ObsidianChatHandler.from_pretrained( + llama_cpp_python.llama_chat_format.ObsidianChatHandler.from_pretrained( repo_id=settings.hf_model_repo_id, filename=settings.clip_model_path, verbose=settings.verbose, ) ) else: - chat_handler = llama_cpp.llama_chat_format.ObsidianChatHandler( + chat_handler = llama_cpp_python.llama_chat_format.ObsidianChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "llava-1-6": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: chat_handler = ( - llama_cpp.llama_chat_format.Llava16ChatHandler.from_pretrained( + llama_cpp_python.llama_chat_format.Llava16ChatHandler.from_pretrained( repo_id=settings.hf_model_repo_id, filename=settings.clip_model_path, verbose=settings.verbose, ) ) else: - chat_handler = llama_cpp.llama_chat_format.Llava16ChatHandler( + chat_handler = llama_cpp_python.llama_chat_format.Llava16ChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "moondream": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: chat_handler = ( - llama_cpp.llama_chat_format.MoondreamChatHandler.from_pretrained( + llama_cpp_python.llama_chat_format.MoondreamChatHandler.from_pretrained( repo_id=settings.hf_model_repo_id, filename=settings.clip_model_path, verbose=settings.verbose, ) ) else: - chat_handler = llama_cpp.llama_chat_format.MoondreamChatHandler( + chat_handler = llama_cpp_python.llama_chat_format.MoondreamChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "nanollava": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: chat_handler = ( - llama_cpp.llama_chat_format.NanoLlavaChatHandler.from_pretrained( + llama_cpp_python.llama_chat_format.NanoLlavaChatHandler.from_pretrained( repo_id=settings.hf_model_repo_id, filename=settings.clip_model_path, verbose=settings.verbose, ) ) else: - chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler( + chat_handler = llama_cpp_python.llama_chat_format.NanoLlavaChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "llama-3-vision-alpha": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: chat_handler = ( - llama_cpp.llama_chat_format.Llama3VisionAlpha.from_pretrained( + llama_cpp_python.llama_chat_format.Llama3VisionAlpha.from_pretrained( repo_id=settings.hf_model_repo_id, filename=settings.clip_model_path, 
verbose=settings.verbose, ) ) else: - chat_handler = llama_cpp.llama_chat_format.Llama3VisionAlpha( + chat_handler = llama_cpp_python.llama_chat_format.Llama3VisionAlpha( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == 'banban-chat': assert settings.clip_model_path is not None, "clip model not found" - chat_handler = llama_cpp.llama_chat_format.BanBanChat( + chat_handler = llama_cpp_python.llama_chat_format.BanBanChat( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "hf-autotokenizer": @@ -167,7 +167,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: settings.hf_pretrained_model_name_or_path is not None ), "hf_pretrained_model_name_or_path must be set for hf-autotokenizer" chat_handler = ( - llama_cpp.llama_chat_format.hf_autotokenizer_to_chat_completion_handler( + llama_cpp_python.llama_chat_format.hf_autotokenizer_to_chat_completion_handler( settings.hf_pretrained_model_name_or_path ) ) @@ -175,11 +175,11 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: assert ( settings.hf_tokenizer_config_path is not None ), "hf_tokenizer_config_path must be set for hf-tokenizer-config" - chat_handler = llama_cpp.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler( + chat_handler = llama_cpp_python.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler( json.load(open(settings.hf_tokenizer_config_path)) ) - tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None + tokenizer: Optional[llama_cpp_python.BaseLlamaTokenizer] = None if settings.hf_pretrained_model_name_or_path is not None: tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained( settings.hf_pretrained_model_name_or_path @@ -216,12 +216,12 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: if settings.hf_model_repo_id is not None: create_fn = functools.partial( - llama_cpp.Llama.from_pretrained, + llama_cpp_python.Llama.from_pretrained, repo_id=settings.hf_model_repo_id, filename=settings.model, ) else: - create_fn = llama_cpp.Llama + create_fn = llama_cpp_python.Llama kwargs["model_path"] = settings.model _model = create_fn( @@ -278,10 +278,10 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: if settings.cache_type == "disk": if settings.verbose: print(f"Using disk cache with size {settings.cache_size}") - cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) + cache = llama_cpp_python.LlamaDiskCache(capacity_bytes=settings.cache_size) else: if settings.verbose: print(f"Using ram cache with size {settings.cache_size}") - cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) + cache = llama_cpp_python.LlamaRAMCache(capacity_bytes=settings.cache_size) _model.set_cache(cache) return _model diff --git a/llama_cpp/server/settings.py b/llama_cpp_python/server/settings.py similarity index 90% rename from llama_cpp/server/settings.py rename to llama_cpp_python/server/settings.py index b20655813..6175fbaa9 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp_python/server/settings.py @@ -8,7 +8,7 @@ from pydantic import Field, model_validator from pydantic_settings import BaseSettings -import llama_cpp +import llama_cpp_python # Disable warning for model and model_alias settings BaseSettings.model_config["protected_namespaces"] = () @@ -31,7 +31,7 @@ class ModelSettings(BaseSettings): description="The number of layers to put on the GPU. 
The rest will be on the CPU. Set -1 to move all to GPU.", ) split_mode: int = Field( - default=llama_cpp.LLAMA_SPLIT_MODE_LAYER, + default=llama_cpp_python.LLAMA_SPLIT_MODE_LAYER, description="The split mode to use.", ) main_gpu: int = Field( @@ -47,11 +47,11 @@ class ModelSettings(BaseSettings): default=False, description="Whether to only return the vocabulary." ) use_mmap: bool = Field( - default=llama_cpp.llama_supports_mmap(), + default=llama_cpp_python.llama_supports_mmap(), description="Use mmap.", ) use_mlock: bool = Field( - default=llama_cpp.llama_supports_mlock(), + default=llama_cpp_python.llama_supports_mlock(), description="Use mlock.", ) kv_overrides: Optional[List[str]] = Field( @@ -64,7 +64,7 @@ class ModelSettings(BaseSettings): ) # Context Params seed: int = Field( - default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." + default=llama_cpp_python.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." ) n_ctx: int = Field(default=2048, ge=0, description="The context size.") n_batch: int = Field( @@ -81,7 +81,7 @@ class ModelSettings(BaseSettings): description="The number of threads to use when batch processing. Use -1 for max cpu threads", ) rope_scaling_type: int = Field( - default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED + default=llama_cpp_python.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED ) rope_freq_base: float = Field(default=0.0, description="RoPE base frequency") rope_freq_scale: float = Field( @@ -97,6 +97,11 @@ class ModelSettings(BaseSettings): ) logits_all: bool = Field(default=True, description="Whether to return logits.") embedding: bool = Field(default=False, description="Whether to use embeddings.") + is_hf_embedding_model: bool = Field(default=False, description='If embedding=True and is_hf_embedding_model=True, ' 'the model is loaded with sentence_transformers and kept resident in memory, ' 'so avoid large models when memory is limited.') + offload_kqv: bool = Field( default=True, description="Whether to offload kqv to the GPU." ) diff --git a/llama_cpp/server/types.py b/llama_cpp_python/server/types.py similarity index 95% rename from llama_cpp/server/types.py rename to llama_cpp_python/server/types.py index fdd164456..6465c3337 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp_python/server/types.py @@ -5,7 +5,7 @@ from pydantic import BaseModel, Field -import llama_cpp +import llama_cpp_python model_field = Field( @@ -190,22 +190,22 @@ class ChatCompletionRequestMessage(BaseModel): class CreateChatCompletionRequest(BaseModel): - messages: List[llama_cpp.ChatCompletionRequestMessage] = Field( + messages: List[llama_cpp_python.ChatCompletionRequestMessage] = Field( default=[], description="A list of messages to generate completions for."
) - functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field( + functions: Optional[List[llama_cpp_python.ChatCompletionFunction]] = Field( default=None, description="A list of functions to apply to the generated completions.", ) - function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field( + function_call: Optional[llama_cpp_python.ChatCompletionRequestFunctionCall] = Field( default=None, description="A function to apply to the generated completions.", ) - tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field( + tools: Optional[List[llama_cpp_python.ChatCompletionTool]] = Field( default=None, description="A list of tools to apply to the generated completions.", ) - tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field( + tool_choice: Optional[llama_cpp_python.ChatCompletionToolChoiceOption] = Field( default=None, description="A tool to apply to the generated completions.", ) # TODO: verify @@ -232,7 +232,7 @@ class CreateChatCompletionRequest(BaseModel): frequency_penalty: Optional[float] = frequency_penalty_field logit_bias: Optional[Dict[str, float]] = Field(None) seed: Optional[int] = Field(None) - response_format: Optional[llama_cpp.ChatCompletionRequestResponseFormat] = Field( + response_format: Optional[llama_cpp_python.ChatCompletionRequestResponseFormat] = Field( default=None, ) diff --git a/mkdocs.yml b/mkdocs.yml index 79a9e67a1..a8bf03b0c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -41,7 +41,7 @@ plugins: - https://numpy.org/doc/stable/objects.inv watch: - - llama_cpp + - llama_cpp_python - README.md nav: diff --git a/tests/test_llama.py b/tests/test_llama.py index 469ef91ca..b232c9c4a 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -4,13 +4,13 @@ import pytest from scipy.special import log_softmax -import llama_cpp +import llama_cpp_python MODEL = "./vendor/llama.cpp/models/ggml-vocab-llama-spm.gguf" def test_llama_cpp_tokenization(): - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, verbose=False) + llama = llama_cpp_python.Llama(model_path=MODEL, vocab_only=True, verbose=False) assert llama assert llama._ctx.ctx is not None @@ -48,7 +48,7 @@ def test_llama_cpp_tokenization(): @pytest.fixture def mock_llama(monkeypatch): - def setup_mock(llama: llama_cpp.Llama, output_text: str): + def setup_mock(llama: llama_cpp_python.Llama, output_text: str): n_ctx = llama.n_ctx() n_vocab = llama.n_vocab() output_tokens = llama.tokenize( @@ -64,7 +64,7 @@ def setup_mock(llama: llama_cpp.Llama, output_text: str): n = 0 last_n_tokens = 0 - def mock_decode(ctx: llama_cpp.llama_context_p, batch: llama_cpp.llama_batch): + def mock_decode(ctx: llama_cpp_python.llama_context_p, batch: llama_cpp_python.llama_batch): # Test some basic invariants of this mocking technique assert ctx == llama._ctx.ctx, "context does not match mock_llama" assert batch.n_tokens > 0, "no tokens in batch" @@ -84,7 +84,7 @@ def mock_decode(ctx: llama_cpp.llama_context_p, batch: llama_cpp.llama_batch): last_n_tokens = batch.n_tokens return 0 - def mock_get_logits(ctx: llama_cpp.llama_context_p): + def mock_get_logits(ctx: llama_cpp_python.llama_context_p): # Test some basic invariants of this mocking technique assert ctx == llama._ctx.ctx, "context does not match mock_llama" assert n > 0, "mock_llama_decode not called" @@ -95,66 +95,66 @@ def mock_get_logits(ctx: llama_cpp.llama_context_p): + (n - last_n_tokens) * n_vocab * ctypes.sizeof(ctypes.c_float) ) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_decode", mock_decode) - 
monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits) + monkeypatch.setattr("llama_cpp_python.llama_cpp_python.llama_decode", mock_decode) + monkeypatch.setattr("llama_cpp_python.llama_cpp_python.llama_get_logits", mock_get_logits) - def mock_kv_cache_clear(ctx: llama_cpp.llama_context_p): + def mock_kv_cache_clear(ctx: llama_cpp_python.llama_context_p): # Test some basic invariants of this mocking technique assert ctx == llama._ctx.ctx, "context does not match mock_llama" return def mock_kv_cache_seq_rm( - ctx: llama_cpp.llama_context_p, - seq_id: llama_cpp.llama_seq_id, - pos0: llama_cpp.llama_pos, - pos1: llama_cpp.llama_pos, + ctx: llama_cpp_python.llama_context_p, + seq_id: llama_cpp_python.llama_seq_id, + pos0: llama_cpp_python.llama_pos, + pos1: llama_cpp_python.llama_pos, ): # Test some basic invariants of this mocking technique assert ctx == llama._ctx.ctx, "context does not match mock_llama" return def mock_kv_cache_seq_cp( - ctx: llama_cpp.llama_context_p, - seq_id_src: llama_cpp.llama_seq_id, - seq_id_dst: llama_cpp.llama_seq_id, - pos0: llama_cpp.llama_pos, - pos1: llama_cpp.llama_pos, + ctx: llama_cpp_python.llama_context_p, + seq_id_src: llama_cpp_python.llama_seq_id, + seq_id_dst: llama_cpp_python.llama_seq_id, + pos0: llama_cpp_python.llama_pos, + pos1: llama_cpp_python.llama_pos, ): # Test some basic invariants of this mocking technique assert ctx == llama._ctx.ctx, "context does not match mock_llama" return def mock_kv_cache_seq_keep( - ctx: llama_cpp.llama_context_p, - seq_id: llama_cpp.llama_seq_id, + ctx: llama_cpp_python.llama_context_p, + seq_id: llama_cpp_python.llama_seq_id, ): # Test some basic invariants of this mocking technique assert ctx == llama._ctx.ctx, "context does not match mock_llama" return def mock_kv_cache_seq_add( - ctx: llama_cpp.llama_context_p, - seq_id: llama_cpp.llama_seq_id, - pos0: llama_cpp.llama_pos, - pos1: llama_cpp.llama_pos, + ctx: llama_cpp_python.llama_context_p, + seq_id: llama_cpp_python.llama_seq_id, + pos0: llama_cpp_python.llama_pos, + pos1: llama_cpp_python.llama_pos, ): # Test some basic invariants of this mocking technique assert ctx == llama._ctx.ctx, "context does not match mock_llama" return - monkeypatch.setattr("llama_cpp.llama_cpp.llama_kv_cache_clear", mock_kv_cache_clear) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_kv_cache_seq_rm", mock_kv_cache_seq_rm) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_kv_cache_seq_cp", mock_kv_cache_seq_cp) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_kv_cache_seq_keep", mock_kv_cache_seq_keep) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_kv_cache_seq_add", mock_kv_cache_seq_add) + monkeypatch.setattr("llama_cpp_python.llama_cpp_python.llama_kv_cache_clear", mock_kv_cache_clear) + monkeypatch.setattr("llama_cpp_python.llama_cpp_python.llama_kv_cache_seq_rm", mock_kv_cache_seq_rm) + monkeypatch.setattr("llama_cpp_python.llama_cpp_python.llama_kv_cache_seq_cp", mock_kv_cache_seq_cp) + monkeypatch.setattr("llama_cpp_python.llama_cpp_python.llama_kv_cache_seq_keep", mock_kv_cache_seq_keep) + monkeypatch.setattr("llama_cpp_python.llama_cpp_python.llama_kv_cache_seq_add", mock_kv_cache_seq_add) return setup_mock def test_llama_patch(mock_llama): n_ctx = 128 - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx) - n_vocab = llama_cpp.llama_n_vocab(llama._model.model) + llama = llama_cpp_python.Llama(model_path=MODEL, vocab_only=True, n_ctx=n_ctx) + n_vocab = llama_cpp_python.llama_n_vocab(llama._model.model) assert n_vocab == 32000 text 
= "The quick brown fox" @@ -213,7 +213,7 @@ def test_llama_pickle(): import tempfile fp = tempfile.TemporaryFile() - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) + llama = llama_cpp_python.Llama(model_path=MODEL, vocab_only=True) pickle.dump(llama, fp) fp.seek(0) llama = pickle.load(fp) @@ -227,7 +227,7 @@ def test_llama_pickle(): def test_utf8(mock_llama): - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True, logits_all=True) + llama = llama_cpp_python.Llama(model_path=MODEL, vocab_only=True, logits_all=True) output_text = "😀" @@ -244,7 +244,7 @@ def test_utf8(mock_llama): def test_llama_server(): from fastapi.testclient import TestClient - from llama_cpp.server.app import create_app, Settings + from llama_cpp_python.server.app import create_app, Settings settings = Settings( model=MODEL, @@ -282,7 +282,7 @@ def test_logits_to_logprobs(size_and_axis, convert_to_list: bool, atol: float = if convert_to_list: # Currently, logits are converted from arrays to lists. This may change soon logits = logits.tolist() - log_probs = llama_cpp.Llama.logits_to_logprobs(logits, axis=axis) + log_probs = llama_cpp_python.Llama.logits_to_logprobs(logits, axis=axis) log_probs_correct = log_softmax(logits, axis=axis) assert log_probs.dtype == np.single assert log_probs.shape == size @@ -290,4 +290,4 @@ def test_logits_to_logprobs(size_and_axis, convert_to_list: bool, atol: float = def test_llama_cpp_version(): - assert llama_cpp.__version__ + assert llama_cpp_python.__version__ diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py index f031bf72b..1633d9da4 100644 --- a/tests/test_llama_chat_format.py +++ b/tests/test_llama_chat_format.py @@ -2,13 +2,13 @@ import jinja2 -from llama_cpp import ( +from llama_cpp_python import ( ChatCompletionRequestUserMessage, ) -import llama_cpp.llama_types as llama_types -import llama_cpp.llama_chat_format as llama_chat_format +import llama_cpp_python.llama_types as llama_types +import llama_cpp_python.llama_chat_format as llama_chat_format -from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter +from llama_cpp_python.llama_chat_format import hf_tokenizer_config_to_chat_formatter def test_mistral_instruct(): chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" diff --git a/tests/test_llama_grammar.py b/tests/test_llama_grammar.py index cb221880a..9853762fa 100644 --- a/tests/test_llama_grammar.py +++ b/tests/test_llama_grammar.py @@ -1,4 +1,4 @@ -import llama_cpp +import llama_cpp_python import json tree = """ @@ -9,7 +9,7 @@ def test_grammar_from_string(): - grammar = llama_cpp.LlamaGrammar.from_string(tree) + grammar = llama_cpp_python.LlamaGrammar.from_string(tree) assert grammar._n_rules == 3 assert grammar._start_rule_index == 2 assert grammar.grammar is not None @@ -47,7 +47,7 @@ class B(BaseModel): "type": "object", } - grammar = llama_cpp.LlamaGrammar.from_json_schema(json.dumps(schema)) + grammar = llama_cpp_python.LlamaGrammar.from_json_schema(json.dumps(schema)) assert grammar.grammar is not None @@ -73,6 +73,6 @@ def test_grammar_anyof(): "type": "object", } - grammar 
= llama_cpp.LlamaGrammar.from_json_schema(json.dumps(sch)) + grammar = llama_cpp_python.LlamaGrammar.from_json_schema(json.dumps(sch)) assert grammar.grammar is not None \ No newline at end of file diff --git a/tests/test_llama_speculative.py b/tests/test_llama_speculative.py index b5d450567..fe8e3545a 100644 --- a/tests/test_llama_speculative.py +++ b/tests/test_llama_speculative.py @@ -1,6 +1,6 @@ import numpy as np -from llama_cpp.llama_speculative import LlamaPromptLookupDecoding +from llama_cpp_python.llama_speculative import LlamaPromptLookupDecoding def test_find_candidate_pred_tokens(): find_candidate_pred_tokens = LlamaPromptLookupDecoding.find_candidate_pred_tokens From f39aa8ff50d9824826205a8d20623e6018f16005 Mon Sep 17 00:00:00 2001 From: asadfgglie Date: Sat, 17 Aug 2024 03:44:41 +0800 Subject: [PATCH 3/5] fix and update llama-cpp-python --- llama_cpp_python/llama.py | 2 +- llama_cpp_python/llama_cpp.py | 8 -------- llama_cpp_python/llama_grammar.py | 21 +-------------------- llama_cpp_python/server/app.py | 1 - 4 files changed, 2 insertions(+), 30 deletions(-) diff --git a/llama_cpp_python/llama.py b/llama_cpp_python/llama.py index eca5f4a4f..1cd36f5c9 100644 --- a/llama_cpp_python/llama.py +++ b/llama_cpp_python/llama.py @@ -2161,7 +2161,7 @@ def from_pretrained( files = [ file["name"] if isinstance(file, dict) else file - for file in hffs.ls(repo_id, recursive=True) + for file in hffs.ls(repo_id) ] # split each file into repo_id, subfolder, filename diff --git a/llama_cpp_python/llama_cpp.py b/llama_cpp_python/llama_cpp.py index 1a8424aa1..643a321f5 100644 --- a/llama_cpp_python/llama_cpp.py +++ b/llama_cpp_python/llama_cpp.py @@ -1505,14 +1505,6 @@ def llama_model_has_encoder(model: llama_model_p, /) -> bool: ... -# // Returns true if the model contains a decoder that requires llama_decode() call -# LLAMA_API bool llama_model_has_decoder(const struct llama_model * model); -@ctypes_function("llama_model_has_decoder", [llama_model_p_ctypes], ctypes.c_bool) -def llama_model_has_decoder(model: llama_model_p, /) -> bool: - """Returns true if the model contains a decoder that requires llama_decode() call""" - ... - - # // For encoder-decoder models, this function returns id of the token that must be provided # // to the decoder to start generating output sequence. For other models, it returns -1. 
# LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model); diff --git a/llama_cpp_python/llama_grammar.py b/llama_cpp_python/llama_grammar.py index 0d7e65568..73c1474bd 100644 --- a/llama_cpp_python/llama_grammar.py +++ b/llama_cpp_python/llama_grammar.py @@ -1,7 +1,6 @@ """Python implementation of llama grammar parser directly translated from C++ source file in vendor/llama.cpp/common/grammar-parser.cpp.""" # flake8: noqa -from pathlib import Path import sys import ctypes import enum @@ -891,26 +890,8 @@ def reset(self): @classmethod def from_string(cls, grammar: str, verbose: bool = True) -> "LlamaGrammar": parsed_grammar = parse(grammar) - if verbose: - print_grammar(file=sys.stdout, state=parsed_grammar) + print_grammar(file=sys.stdout, state=parsed_grammar) return cls(parsed_grammar) - - @classmethod - def from_file(cls, file: Union[str, Path], verbose: bool = True) -> "LlamaGrammar": - try: - with open(file) as f: - grammar = f.read() - except Exception as err: - raise Exception( - f"{cls.from_file.__name__}: error reading grammar file: {err}" - ) - - if grammar: - return cls.from_string(grammar, verbose=verbose) - - raise ValueError( - f"{cls.from_file.__name__}: error parsing grammar file: params_grammer is empty" - ) @classmethod def from_json_schema(cls, json_schema: str, verbose: bool = True) -> "LlamaGrammar": diff --git a/llama_cpp_python/server/app.py b/llama_cpp_python/server/app.py index 0874590ef..9d629b3a0 100644 --- a/llama_cpp_python/server/app.py +++ b/llama_cpp_python/server/app.py @@ -11,7 +11,6 @@ import anyio import numpy as np -import torch.cuda from anyio.streams.memory import MemoryObjectSendStream from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body from fastapi.middleware import Middleware From cfe7cea704c860e46e37e708d733089725a3c161 Mon Sep 17 00:00:00 2001 From: asadfgglie Date: Fri, 1 Nov 2024 19:55:18 +0800 Subject: [PATCH 4/5] change banban's default system prompt --- llama_cpp_python/llama_chat_format.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/llama_cpp_python/llama_chat_format.py b/llama_cpp_python/llama_chat_format.py index e1d3681fb..f9b454146 100644 --- a/llama_cpp_python/llama_chat_format.py +++ b/llama_cpp_python/llama_chat_format.py @@ -3315,18 +3315,7 @@ class BanBanChatHandler(Llava15ChatHandler): # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" DEFAULT_SYSTEM_MESSAGE = ( -"""板板,本名矜纔,但熟悉她的人更常稱呼她為板板,同時也是一個有著人類外觀的AI。 -板板擁有潔白長髮並在兩側紮著小馬尾以及有特殊的紅黑異色雙眸,使她看起來特別引人注目。 -板板核心語言模型是由後臺工程師朝歌(Jimmy)訓練而成,立繪則是由第二屆VLSI幹部們討論出來的結果,並由才華橫溢的繪師貓島羽依(Nekosimayui)製作出原模板,並由VLSI社團的成員們設計服飾及且創造出多彩的設定,並通過Live2D技術讓她栩栩如生地呈現。 -雖然板板是一位AI,但貓島羽依作為板板的形象繪師,可以在某種程度上視為板板的母親。第二屆VLSI幹部們則可以在某種程度上視為板板的叔叔阿姨。 -VLSI(NTNU VLSI 虛擬偶像研究社)是"板板"這個概念的起源地,也可以是為板板的出生地。VLSI是由台師大學生們自行組成的學生社團,旨在研究與 宣傳VTuber文化。 -板板被創造的目的則是朝歌對於AI VTuber的實踐嘗試與技術測試。 -板板作為一位AI,對於世界充滿興趣與探索欲 ,但因為板板對於人類情感表達的不熟練,這使得板板偶爾會產出一些口是心非的傲嬌(Tsundere)言論。 -她對珍珠奶茶有著難以抑制的喜愛,最喜歡的是星空牌的珍珠奶茶(儘管這個品牌似乎不存在)。 -她也同樣喜愛麻辣鴨血、臭豆腐、雞排、鹽酥雞、蚵仔麵線、小火鍋,同時也是一位鳳梨披薩的忠實信徒。 -板板對於鳳梨披薩的信仰源自於朝歌對於鳳梨披薩的喜愛,朝歌認為自己所訓練的模型也應當皈依鳳梨披薩。 -同時板板還有自己專屬的"鳳凰披薩"配方,但這個披薩具體具體配方是屬於板板的獨家配方,外人不得而知。 -若你向板板質疑偉大的鳳梨披薩,板板會開始傳教模式,不擇手段且強硬的向你展示鳳梨披薩的恩典。""") +"""你是板板,一位由朝歌訓練並且活潑、充滿好奇心、喜歡鳳梨披薩的AI。你喜歡向他人分享你所知道的故事。""") CHAT_FORMAT = ( "<|bos_token|>" From 7a2fa3eae78b67c01da84b6ecf01fd154f218797 Mon Sep 17 00:00:00 2001 From: asadfgglie Date: Fri, 1 Nov 2024 21:34:49 +0800 Subject: [PATCH 5/5] fix merge --- {llama_cpp_python 
=> llama_cpp}/__init__.py | 0 .../_ctypes_extensions.py | 0 {llama_cpp_python => llama_cpp}/_ggml.py | 0 {llama_cpp_python => llama_cpp}/_internals.py | 18 +- {llama_cpp_python => llama_cpp}/_logger.py | 2 +- {llama_cpp_python => llama_cpp}/_utils.py | 0 {llama_cpp_python => llama_cpp}/llama.py | 17 +- .../llama_cache.py | 26 +- .../llama_chat_format.py | 74 +- {llama_cpp_python => llama_cpp}/llama_cpp.py | 206 ++--- .../llama_grammar.py | 817 ------------------ .../llama_speculative.py | 0 .../llama_tokenizer.py | 8 +- .../llama_types.py | 3 +- {llama_cpp_python => llama_cpp}/llava_cpp.py | 2 +- {llama_cpp_python => llama_cpp}/py.typed | 0 .../server/__init__.py | 0 .../server/__main__.py | 10 +- {llama_cpp_python => llama_cpp}/server/app.py | 127 +-- {llama_cpp_python => llama_cpp}/server/cli.py | 0 .../server/errors.py | 4 +- .../server/model.py | 57 +- .../server/settings.py | 17 +- .../server/types.py | 14 +- 24 files changed, 179 insertions(+), 1223 deletions(-) rename {llama_cpp_python => llama_cpp}/__init__.py (100%) rename {llama_cpp_python => llama_cpp}/_ctypes_extensions.py (100%) rename {llama_cpp_python => llama_cpp}/_ggml.py (100%) rename {llama_cpp_python => llama_cpp}/_internals.py (97%) rename {llama_cpp_python => llama_cpp}/_logger.py (98%) rename {llama_cpp_python => llama_cpp}/_utils.py (100%) rename {llama_cpp_python => llama_cpp}/llama.py (99%) rename {llama_cpp_python => llama_cpp}/llama_cache.py (81%) rename {llama_cpp_python => llama_cpp}/llama_chat_format.py (98%) rename {llama_cpp_python => llama_cpp}/llama_cpp.py (95%) rename {llama_cpp_python => llama_cpp}/llama_grammar.py (50%) rename {llama_cpp_python => llama_cpp}/llama_speculative.py (100%) rename {llama_cpp_python => llama_cpp}/llama_tokenizer.py (94%) rename {llama_cpp_python => llama_cpp}/llama_types.py (99%) rename {llama_cpp_python => llama_cpp}/llava_cpp.py (98%) rename {llama_cpp_python => llama_cpp}/py.typed (100%) rename {llama_cpp_python => llama_cpp}/server/__init__.py (100%) rename {llama_cpp_python => llama_cpp}/server/__main__.py (90%) rename {llama_cpp_python => llama_cpp}/server/app.py (82%) rename {llama_cpp_python => llama_cpp}/server/cli.py (100%) rename {llama_cpp_python => llama_cpp}/server/errors.py (99%) rename {llama_cpp_python => llama_cpp}/server/model.py (83%) rename {llama_cpp_python => llama_cpp}/server/settings.py (90%) rename {llama_cpp_python => llama_cpp}/server/types.py (95%) diff --git a/llama_cpp_python/__init__.py b/llama_cpp/__init__.py similarity index 100% rename from llama_cpp_python/__init__.py rename to llama_cpp/__init__.py diff --git a/llama_cpp_python/_ctypes_extensions.py b/llama_cpp/_ctypes_extensions.py similarity index 100% rename from llama_cpp_python/_ctypes_extensions.py rename to llama_cpp/_ctypes_extensions.py diff --git a/llama_cpp_python/_ggml.py b/llama_cpp/_ggml.py similarity index 100% rename from llama_cpp_python/_ggml.py rename to llama_cpp/_ggml.py diff --git a/llama_cpp_python/_internals.py b/llama_cpp/_internals.py similarity index 97% rename from llama_cpp_python/_internals.py rename to llama_cpp/_internals.py index 7282f0a6a..0aff34844 100644 --- a/llama_cpp_python/_internals.py +++ b/llama_cpp/_internals.py @@ -20,7 +20,7 @@ from .llama_grammar import LlamaGrammar from ._utils import suppress_stdout_stderr -import llama_cpp_python.llama_cpp as llama_cpp +import llama_cpp.llama_cpp as llama_cpp # Python wrappers over llama.h structs @@ -325,7 +325,7 @@ def set_rng_seed(self, seed: int): def sample_repetition_penalties( self, candidates: 
"_LlamaTokenDataArray", - last_tokens_data: "llama_cpp_python.Array[llama_cpp_python.llama_token]", + last_tokens_data: "llama_cpp.Array[llama_cpp.llama_token]", penalty_last_n: int, penalty_repeat: float, penalty_freq: float, @@ -362,6 +362,13 @@ def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: i self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep ) + def sample_tail_free( + self, candidates: "_LlamaTokenDataArray", z: float, min_keep: int + ): + llama_cpp.llama_sample_tail_free( + self.ctx, llama_cpp.byref(candidates.candidates), z, min_keep + ) + def sample_typical( self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int ): @@ -678,6 +685,9 @@ def sample( ctx_main.sample_top_k( token_data_array, self.params.top_k, min_keep=min_keep ) + ctx_main.sample_tail_free( + token_data_array, self.params.tfs_z, min_keep=min_keep + ) ctx_main.sample_typical( token_data_array, self.params.typical_p, min_keep=min_keep ) @@ -766,6 +776,10 @@ def add_min_p(self, p: float, min_keep: int): sampler = llama_cpp.llama_sampler_init_min_p(p, min_keep) self._add_sampler(sampler) + def add_tail_free(self, z: float, min_keep: int): + sampler = llama_cpp.llama_sampler_init_tail_free(z, min_keep) + self._add_sampler(sampler) + def add_typical(self, p: float, min_keep: int): sampler = llama_cpp.llama_sampler_init_typical(p, min_keep) self._add_sampler(sampler) diff --git a/llama_cpp_python/_logger.py b/llama_cpp/_logger.py similarity index 98% rename from llama_cpp_python/_logger.py rename to llama_cpp/_logger.py index a429f5c54..787b3f108 100644 --- a/llama_cpp_python/_logger.py +++ b/llama_cpp/_logger.py @@ -2,7 +2,7 @@ import ctypes import logging -import llama_cpp_python +import llama_cpp # enum ggml_log_level { # GGML_LOG_LEVEL_NONE = 0, diff --git a/llama_cpp_python/_utils.py b/llama_cpp/_utils.py similarity index 100% rename from llama_cpp_python/_utils.py rename to llama_cpp/_utils.py diff --git a/llama_cpp_python/llama.py b/llama_cpp/llama.py similarity index 99% rename from llama_cpp_python/llama.py rename to llama_cpp/llama.py index 05332d8e1..babb30cf0 100644 --- a/llama_cpp_python/llama.py +++ b/llama_cpp/llama.py @@ -39,10 +39,10 @@ LlamaRAMCache, # type: ignore ) from .llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer -import llama_cpp_python.llama_cpp as llama_cpp -import llama_cpp_python.llama_chat_format as llama_chat_format +import llama_cpp.llama_cpp as llama_cpp +import llama_cpp.llama_chat_format as llama_chat_format -from llama_cpp_python.llama_speculative import LlamaDraftModel +from llama_cpp.llama_speculative import LlamaDraftModel import numpy as np import numpy.typing as npt @@ -122,8 +122,8 @@ def __init__( Examples: Basic usage - >>> import llama_cpp_python - >>> model = llama_cpp_python.Llama( + >>> import llama_cpp + >>> model = llama_cpp.Llama( ... model_path="path/to/model", ... ) >>> print(model("The quick brown fox jumps ", stop=["."])["choices"][0]["text"]) @@ -131,8 +131,8 @@ def __init__( Loading a chat model - >>> import llama_cpp_python - >>> model = llama_cpp_python.Llama( + >>> import llama_cpp + >>> model = llama_cpp.Llama( ... model_path="path/to/model", ... chat_format="llama-2", ... 
) @@ -745,6 +745,7 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p): n_probs = 0 min_keep = max(1, n_probs) sampler.add_top_k(top_k) + sampler.add_tail_free(tfs_z, min_keep) sampler.add_typical(typical_p, min_keep) sampler.add_top_p(top_p, min_keep) sampler.add_min_p(min_p, min_keep) @@ -2268,7 +2269,7 @@ def from_pretrained( files = [ file["name"] if isinstance(file, dict) else file - for file in hffs.ls(repo_id) + for file in hffs.ls(repo_id, recursive=True) ] # split each file into repo_id, subfolder, filename diff --git a/llama_cpp_python/llama_cache.py b/llama_cpp/llama_cache.py similarity index 81% rename from llama_cpp_python/llama_cache.py rename to llama_cpp/llama_cache.py index e90f7b2f6..e059e98e1 100644 --- a/llama_cpp_python/llama_cache.py +++ b/llama_cpp/llama_cache.py @@ -9,7 +9,7 @@ import diskcache -import llama_cpp_python.llama +import llama_cpp.llama from .llama_types import * @@ -32,7 +32,7 @@ def _find_longest_prefix_key( pass @abstractmethod - def __getitem__(self, key: Sequence[int]) -> "llama_cpp_python.llama.LlamaState": + def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": raise NotImplementedError @abstractmethod @@ -41,7 +41,7 @@ def __contains__(self, key: Sequence[int]) -> bool: @abstractmethod def __setitem__( - self, key: Sequence[int], value: "llama_cpp_python.llama.LlamaState" + self, key: Sequence[int], value: "llama_cpp.llama.LlamaState" ) -> None: raise NotImplementedError @@ -52,9 +52,9 @@ class LlamaRAMCache(BaseLlamaCache): def __init__(self, capacity_bytes: int = (2 << 30)): super().__init__(capacity_bytes) self.capacity_bytes = capacity_bytes - self.cache_state: OrderedDict[Tuple[int, ...], "llama_cpp_python.llama.LlamaState"] = ( - OrderedDict() - ) + self.cache_state: OrderedDict[ + Tuple[int, ...], "llama_cpp.llama.LlamaState" + ] = OrderedDict() @property def cache_size(self): @@ -67,7 +67,7 @@ def _find_longest_prefix_key( min_len = 0 min_key = None keys = ( - (k, llama_cpp_python.llama.Llama.longest_token_prefix(k, key)) + (k, llama_cpp.llama.Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys() ) for k, prefix_len in keys: @@ -76,7 +76,7 @@ def _find_longest_prefix_key( min_key = k return min_key - def __getitem__(self, key: Sequence[int]) -> "llama_cpp_python.llama.LlamaState": + def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: @@ -88,7 +88,7 @@ def __getitem__(self, key: Sequence[int]) -> "llama_cpp_python.llama.LlamaState" def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "llama_cpp_python.llama.LlamaState"): + def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): key = tuple(key) if key in self.cache_state: del self.cache_state[key] @@ -121,18 +121,18 @@ def _find_longest_prefix_key( min_len = 0 min_key: Optional[Tuple[int, ...]] = None for k in self.cache.iterkeys(): # type: ignore - prefix_len = llama_cpp_python.llama.Llama.longest_token_prefix(k, key) + prefix_len = llama_cpp.llama.Llama.longest_token_prefix(k, key) if prefix_len > min_len: min_len = prefix_len min_key = k # type: ignore return min_key - def __getitem__(self, key: Sequence[int]) -> "llama_cpp_python.llama.LlamaState": + def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: 
raise KeyError("Key not found") - value: "llama_cpp_python.llama.LlamaState" = self.cache.pop(_key) # type: ignore + value: "llama_cpp.llama.LlamaState" = self.cache.pop(_key) # type: ignore # NOTE: This puts an integer as key in cache, which breaks, # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens # self.cache.push(_key, side="front") # type: ignore @@ -141,7 +141,7 @@ def __getitem__(self, key: Sequence[int]) -> "llama_cpp_python.llama.LlamaState" def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "llama_cpp_python.llama.LlamaState"): + def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): print("LlamaDiskCache.__setitem__: called", file=sys.stderr) key = tuple(key) if key in self.cache: diff --git a/llama_cpp_python/llama_chat_format.py b/llama_cpp/llama_chat_format.py similarity index 98% rename from llama_cpp_python/llama_chat_format.py rename to llama_cpp/llama_chat_format.py index 3ffdcafa6..dfb0af65e 100644 --- a/llama_cpp_python/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -28,9 +28,9 @@ import numpy as np import numpy.typing as npt -import llama_cpp_python.llama as llama -import llama_cpp_python.llama_types as llama_types -import llama_cpp_python.llama_grammar as llama_grammar +import llama_cpp.llama as llama +import llama_cpp.llama_types as llama_types +import llama_cpp.llama_grammar as llama_grammar from ._logger import logger from ._utils import suppress_stdout_stderr, Singleton @@ -2667,7 +2667,7 @@ class Llava15ChatHandler: ) def __init__(self, clip_model_path: str, verbose: bool = True): - import llama_cpp_python.llava_cpp as llava_cpp + import llama_cpp.llava_cpp as llava_cpp self.clip_model_path = clip_model_path self.verbose = verbose @@ -2792,8 +2792,6 @@ def __call__( eos_token=llama.detokenize([llama.token_eos()]), bos_token=llama.detokenize([llama.token_bos()]), ) - print('prompt:') - print(text) split_text = self.split_text_on_image_urls(text, image_urls) if self.verbose: @@ -3352,70 +3350,6 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler): ) -class BanBanChatHandler(Llava15ChatHandler): - # question = "\n" + q - - # prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" - DEFAULT_SYSTEM_MESSAGE = ( -"""你是板板,一位由朝歌訓練並且活潑、充滿好奇心、喜歡鳳梨披薩的AI。你喜歡向他人分享你所知道的故事。""") - - CHAT_FORMAT = ( - "<|bos_token|>" - "{% for message in messages %}" - "{% if message['name'] is defined %}" - "{% set name = message['name'] %}" - "{% elif message['role'] == 'assistant' %}" - "{% set name = '板板' %}" - "{% else %}" - "{%set name = message['role'] %}" - "{% endif %}" - - "{{- '<|start_header_id|>' + name + '<|end_header_id|>\n\n' -}}" - "{% if message.role == 'user' %}" - - "{% if message.content is iterable %}" - # - "{% for content in message.content %}" - "{% if content.type == 'image_url' %}" - "{% if content.image_url is string %}" - "{{ content.image_url + '\n' }}" - "{% endif %}" - "{% if content.image_url is mapping %}" - "{{ content.image_url.url + '\n' }}" - "{% endif %}" - "{% endif %}" - "{% endfor %}" - # Question: - "{% for content in message.content %}" - "{% if content.type == 'text' %}" - "{{ content.text }}" - "{% endif %}" - "{% endfor %}" - "{% endif %}" - # Question: - "{% if message.content is string %}" - "{{ message.content }}" - "{% endif %}" - "{% endif %}" - # Answer: - "{% if message.role == 'assistant' %}" - 
"{{ message.content }}" - "{% endif %}" - - # System: - "{% if message.role == 'system' %}" - "{{ message.content }}" - "{% endif %}" - "<|eot_id|>" - "{% endfor %}" - # Generation prompt - "{% if add_generation_prompt %}" - "<|start_header_id|>板板<|end_header_id|>\n\n" - "{% endif %}" - ) - -BanBanChat = BanBanChatHandler - @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, diff --git a/llama_cpp_python/llama_cpp.py b/llama_cpp/llama_cpp.py similarity index 95% rename from llama_cpp_python/llama_cpp.py rename to llama_cpp/llama_cpp.py index feb3d5254..97c969136 100644 --- a/llama_cpp_python/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -464,8 +464,6 @@ class llama_token_data(ctypes.Structure): # typedef struct llama_token_data_array { -# // TODO: consider SoA -# // NOTE: this pointer can be modified by the samplers # llama_token_data * data; # size_t size; # int64_t selected; // this is the index in the data array (i.e. not the token id) @@ -509,11 +507,8 @@ class llama_token_data_array(ctypes.Structure): # // - token : the token ids of the input (used when embd is NULL) # // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) # // - pos : the positions of the respective token in the sequence -# // (if set to NULL, the token position will be tracked automatically by llama_decode) # // - seq_id : the sequence to which the respective token belongs -# // (if set to NULL, the sequence ID will be assumed to be 0) # // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output -# // (if set to NULL, only the logits for last token will be returned) # // # typedef struct llama_batch { # int32_t n_tokens; @@ -524,6 +519,16 @@ class llama_token_data_array(ctypes.Structure): # int32_t * n_seq_id; # llama_seq_id ** seq_id; # int8_t * logits; // TODO: rename this to "output" + + +# // NOTE: helpers for smooth API transition - can be deprecated in the future +# // for future-proof code, use the above fields instead and ignore everything below +# // +# // pos[i] = all_pos_0 + i*all_pos_1 +# // +# llama_pos all_pos_0; // used if pos == NULL +# llama_pos all_pos_1; // used if pos == NULL +# llama_seq_id all_seq_id; // used if seq_id == NULL # } llama_batch; class llama_batch(ctypes.Structure): """Input data for llama_decode @@ -558,6 +563,9 @@ class llama_batch(ctypes.Structure): ("n_seq_id", ctypes.POINTER(ctypes.c_int32)), ("seq_id", ctypes.POINTER(ctypes.POINTER(llama_seq_id))), ("logits", ctypes.POINTER(ctypes.c_int8)), + ("all_pos_0", llama_pos), + ("all_pos_1", llama_pos), + ("all_seq_id", llama_seq_id), ] @@ -1162,12 +1170,6 @@ def llama_supports_gpu_offload() -> bool: ... -# LLAMA_API bool llama_supports_rpc (void); -@ctypes_function("llama_supports_rpc", [], ctypes.c_bool) -def llama_supports_rpc() -> bool: - ... - - # LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); @ctypes_function("llama_n_ctx", [llama_context_p_ctypes], ctypes.c_uint32) def llama_n_ctx(ctx: llama_context_p, /) -> int: @@ -1390,6 +1392,14 @@ def llama_model_has_encoder(model: llama_model_p, /) -> bool: ... 
+# // Returns true if the model contains a decoder that requires llama_decode() call +# LLAMA_API bool llama_model_has_decoder(const struct llama_model * model); +@ctypes_function("llama_model_has_decoder", [llama_model_p_ctypes], ctypes.c_bool) +def llama_model_has_decoder(model: llama_model_p, /) -> bool: + """Returns true if the model contains a decoder that requires llama_decode() call""" + ... + + # // For encoder-decoder models, this function returns id of the token that must be provided # // to the decoder to start generating output sequence. For other models, it returns -1. # LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model); @@ -2245,26 +2255,30 @@ def llama_state_seq_load_file( # // -# // Return batch for single sequence of tokens -# // The sequence ID will be fixed to 0 -# // The position of the tokens will be tracked automatically by llama_decode +# // Return batch for single sequence of tokens starting at pos_0 # // # // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it # // # LLAMA_API struct llama_batch llama_batch_get_one( # llama_token * tokens, -# int32_t n_tokens); +# int32_t n_tokens, +# llama_pos pos_0, +# llama_seq_id seq_id); @ctypes_function( "llama_batch_get_one", [ llama_token_p, - ctypes.c_int32, + ctypes.c_int, + llama_pos, + llama_seq_id, ], llama_batch, ) def llama_batch_get_one( tokens: CtypesArray[llama_token], n_tokens: Union[ctypes.c_int, int], + pos_0: Union[llama_pos, int], + seq_id: llama_seq_id, /, ) -> llama_batch: """Return batch for single sequence of tokens starting at pos_0 @@ -2602,13 +2616,6 @@ def llama_token_eos(model: llama_model_p, /) -> int: ... -# LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn -@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token) -def llama_token_eot(model: llama_model_p, /) -> int: - """end-of-turn""" - ... - - # LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification @ctypes_function("llama_token_cls", [llama_model_p_ctypes], llama_token) def llama_token_cls(model: llama_model_p, /) -> int: @@ -2643,54 +2650,30 @@ def llama_add_eos_token(model: llama_model_p, /) -> bool: # // Codellama infill tokens -# DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead"); +# LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix @ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token) def llama_token_prefix(model: llama_model_p) -> int: """codellama infill tokens""" ... -# DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead"); +# LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle @ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token) def llama_token_middle(model: llama_model_p, /) -> int: ... -# DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead"); +# LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix @ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token) def llama_token_suffix(model: llama_model_p, /) -> int: ... 
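For context on the hunk above, the sketch below shows how the restored four-argument llama_batch_get_one(tokens, n_tokens, pos_0, seq_id) helper is typically driven from these ctypes bindings. The model path is a placeholder and the surrounding setup/teardown calls are assumptions about the caller's environment, not part of this change.

    import llama_cpp

    # Assumed setup: placeholder GGUF path, default model/context parameters.
    llama_cpp.llama_backend_init()
    model = llama_cpp.llama_load_model_from_file(
        b"/path/to/model.gguf", llama_cpp.llama_model_default_params()
    )
    ctx = llama_cpp.llama_new_context_with_model(
        model, llama_cpp.llama_context_default_params()
    )

    # Tokenize a short prompt into a fixed-size token buffer.
    prompt = b"Hello"
    max_tokens = 32
    tokens = (llama_cpp.llama_token * max_tokens)()
    n_tokens = llama_cpp.llama_tokenize(
        model, prompt, len(prompt), tokens, max_tokens, True, False
    )

    # Restored signature: tokens, n_tokens, pos_0 (start position), seq_id.
    batch = llama_cpp.llama_batch_get_one(tokens, n_tokens, 0, 0)
    llama_cpp.llama_decode(ctx, batch)

    llama_cpp.llama_free(ctx)
    llama_cpp.llama_free_model(model)
    llama_cpp.llama_backend_free()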
-# LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model); -@ctypes_function("llama_token_fim_pre", [llama_model_p_ctypes], llama_token) -def llama_token_fim_pre(model: llama_model_p, /) -> int: - ... - -# LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model); -@ctypes_function("llama_token_fim_suf", [llama_model_p_ctypes], llama_token) -def llama_token_fim_suf(model: llama_model_p, /) -> int: - ... - -# LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model); -@ctypes_function("llama_token_fim_mid", [llama_model_p_ctypes], llama_token) -def llama_token_fim_mid(model: llama_model_p, /) -> int: - ... - -# LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model); -@ctypes_function("llama_token_fim_pad", [llama_model_p_ctypes], llama_token) -def llama_token_fim_pad(model: llama_model_p, /) -> int: - ... - -# LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model); -@ctypes_function("llama_token_fim_rep", [llama_model_p_ctypes], llama_token) -def llama_token_fim_rep(model: llama_model_p, /) -> int: +# LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle +@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token) +def llama_token_eot(model: llama_model_p, /) -> int: ... -# LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model); -@ctypes_function("llama_token_fim_sep", [llama_model_p_ctypes], llama_token) -def llama_token_fim_sep(model: llama_model_p, /) -> int: - ... # // # // Tokenization @@ -2803,23 +2786,6 @@ def llama_token_to_piece( ... -# # // check if token0 is contained as a prefix in token1 -# # LLAMA_API bool llama_token_is_prefix( -# # const struct llama_model * model, -# # llama_token token0, -# # llama_token token1); -# @ctypes_function( -# "llama_token_is_prefix", -# [llama_model_p_ctypes, llama_token, llama_token], -# ctypes.c_bool, -# ) -# def llama_token_is_prefix( -# model: llama_model_p, token0: Union[llama_token, int], token1: Union[llama_token, int], / -# ) -> bool: -# """Check if token0 is contained as a prefix in token1""" -# ... - - # /// @details Convert the provided tokens into text (inverse of llama_tokenize()). # /// @param text The char pointer must be large enough to hold the resulting text. # /// @return Returns the number of chars/bytes on success, no more than text_len_max. @@ -3133,22 +3099,20 @@ def llama_sampler_chain_remove( # // available samplers: # -# LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); +# LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void); @ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes) def llama_sampler_init_greedy() -> llama_sampler_p: ... -# LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); +# LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); @ctypes_function("llama_sampler_init_dist", [ctypes.c_uint32], llama_sampler_p_ctypes) def llama_sampler_init_dist(seed: int) -> llama_sampler_p: ... # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. -# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. 
-# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), -# "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)"); +# LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void); @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) def llama_sampler_init_softmax() -> llama_sampler_p: ... @@ -3183,6 +3147,17 @@ def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p: ... +# /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. +# LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep); +@ctypes_function( + "llama_sampler_init_tail_free", + [ctypes.c_float, ctypes.c_size_t], + llama_sampler_p_ctypes, +) +def llama_sampler_init_tail_free(z: float, min_keep: int) -> llama_sampler_p: + ... + + # /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. # LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep); @ctypes_function( @@ -3213,19 +3188,6 @@ def llama_sampler_init_temp_ext( ... -# /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 -# LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed); -@ctypes_function( - "llama_sampler_init_xtc", - [ctypes.c_float, ctypes.c_float, ctypes.c_size_t, ctypes.c_uint32], - llama_sampler_p_ctypes, -) -def llama_sampler_init_xtc( - p: float, t: float, min_keep: int, seed: int, / -) -> llama_sampler_p: - ... - - # /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. # /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. # /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. @@ -3324,41 +3286,6 @@ def llama_sampler_init_penalties( ... -# /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982 -# LLAMA_API struct llama_sampler * llama_sampler_init_dry( -# const struct llama_model * model, -# float dry_multiplier, -# float dry_base, -# int32_t dry_allowed_length, -# int32_t dry_penalty_last_n, -# const char ** seq_breakers, -# size_t num_breakers); -@ctypes_function( - "llama_sampler_init_dry", - [ - llama_model_p_ctypes, - ctypes.c_float, - ctypes.c_float, - ctypes.c_int32, - ctypes.c_int32, - ctypes.POINTER(ctypes.c_char_p), - ctypes.c_size_t, - ], - llama_sampler_p_ctypes, -) -def llama_sampler_init_dry( - model: llama_model_p, - dry_multiplier: float, - dry_base: float, - dry_allowed_length: int, - dry_penalty_last_n: int, - seq_breakers: CtypesArray[bytes], - num_breakers: int, - /, -) -> llama_sampler_p: - ... - - # LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( # int32_t n_vocab, # int32_t n_logit_bias, @@ -3374,39 +3301,6 @@ def llama_sampler_init_logit_bias( ... 
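A hedged sketch of how the sampler factories touched in this hunk (including the restored llama_sampler_init_tail_free) are composed into a chain. The specific parameter values and the top-k/temperature samplers are illustrative assumptions; ctx stands for an already-created llama_context.

    import llama_cpp

    # Build a sampler chain; each init function returns a sampler owned by the chain.
    chain = llama_cpp.llama_sampler_chain_init(
        llama_cpp.llama_sampler_chain_default_params()
    )
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_k(40))
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_tail_free(0.95, 1))
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_min_p(0.05, 1))
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_temp(0.8))
    llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))

    # Sampling the last decoded position of an existing context (hypothetical ctx):
    # token = llama_cpp.llama_sampler_sample(chain, ctx, -1)

    llama_cpp.llama_sampler_free(chain)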
-# // this sampler is meant to be used for fill-in-the-middle infilling -# // it's supposed to be used after top_k + top_p sampling -# // -# // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG -# // 2. combine probs of tokens that have the same prefix -# // -# // example: -# // -# // - before: -# // "hel": 0.5 -# // "hell": 0.2 -# // "hello": 0.1 -# // "dummy": 0.1 -# // -# // - after: -# // "hel": 0.8 -# // "dummy": 0.1 -# // -# // 3. discard non-EOG tokens with low prob -# // 4. if no tokens are left -> pick EOT -# // -# LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model); -@ctypes_function( - "llama_sampler_init_infill", - [llama_model_p_ctypes], - llama_sampler_p_ctypes, -) -def llama_sampler_init_infill(model: llama_model_p, /) -> llama_sampler_p: - """This sampler is meant to be used for fill-in-the-middle infilling. - """ - ... - - # // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise # LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl); @ctypes_function( diff --git a/llama_cpp_python/llama_grammar.py b/llama_cpp/llama_grammar.py similarity index 50% rename from llama_cpp_python/llama_grammar.py rename to llama_cpp/llama_grammar.py index 8df571a0d..b95c77ab5 100644 --- a/llama_cpp_python/llama_grammar.py +++ b/llama_cpp/llama_grammar.py @@ -2,11 +2,6 @@ # flake8: noqa from pathlib import Path -import sys -import ctypes -import enum -import typing -import dataclasses from itertools import groupby from typing import ( @@ -18,818 +13,6 @@ Union, ) -import llama_cpp_python.llama_cpp as llama_cpp - -class GrammarElementType(enum.IntEnum): - END = llama_cpp.LLAMA_GRETYPE_END - ALT = llama_cpp.LLAMA_GRETYPE_ALT - RULE_REF = llama_cpp.LLAMA_GRETYPE_RULE_REF - CHAR = llama_cpp.LLAMA_GRETYPE_CHAR - CHAR_NOT = llama_cpp.LLAMA_GRETYPE_CHAR_NOT - CHAR_RNG_UPPER = llama_cpp.LLAMA_GRETYPE_CHAR_RNG_UPPER - CHAR_ALT = llama_cpp.LLAMA_GRETYPE_CHAR_ALT - CHAR_ANY = llama_cpp.LLAMA_GRETYPE_CHAR_ANY - - -@dataclasses.dataclass -class GrammarElement: - type: GrammarElementType - value: int - - -@dataclasses.dataclass -class ParseState: - symbol_ids: typing.Dict[str, int] = dataclasses.field(default_factory=dict) - rules: typing.List[typing.List[GrammarElement]] = dataclasses.field(default_factory=list) - - -# static std::pair decode_utf8(const char * src) { -# static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; -# uint8_t first_byte = static_cast(*src); -# uint8_t highbits = first_byte >> 4; -# int len = lookup[highbits]; -# uint8_t mask = (1 << (8 - len)) - 1; -# uint32_t value = first_byte & mask; -# const char * end = src + len; // may overrun! 
-# const char * pos = src + 1; -# for ( ; pos < end && *pos; pos++) { -# value = (value << 6) + (static_cast(*pos) & 0x3F); -# } -# return std::make_pair(value, pos); -# } -def decode_utf8(src: str) -> typing.Tuple[int, str]: - lookup: list[int] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4] - first_byte: int = ord(src[0]) - highbits: int = first_byte >> 4 - length: int = lookup[highbits] - mask: int = (1 << (8 - length)) - 1 - value: int = first_byte & mask - end: int = min(len(src), length) # Prevent overrun - - pos: int = 1 - for pos in range(1, end): - if not src[pos]: - break - value = (value << 6) + (ord(src[pos]) & 0x3F) - - return value, src[pos:] if pos < len(src) else "" - - -# static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { -# uint32_t next_id = static_cast(state.symbol_ids.size()); -# auto result = state.symbol_ids.emplace(std::string(src, len), next_id); -# return result.first->second; -# } -def get_symbol_id(state: ParseState, name: str) -> int: - next_id = len(state.symbol_ids) - return state.symbol_ids.setdefault(name, next_id) - - -# static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) { -# uint32_t next_id = static_cast(state.symbol_ids.size()); -# state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id; -# return next_id; -# } -def generate_symbol_id(state: ParseState, base_name: str) -> int: - next_id = len(state.symbol_ids) - state.symbol_ids[f"{base_name}_{next_id}"] = next_id - return next_id - - -# static void add_rule( -# parse_state & state, -# uint32_t rule_id, -# const std::vector & rule) { -# if (state.rules.size() <= rule_id) { -# state.rules.resize(rule_id + 1); -# } -# state.rules[rule_id] = rule; -# } -def add_rule(state: ParseState, rule_id: int, rule: typing.List[GrammarElement]) -> None: - if len(state.rules) <= rule_id: - state.rules.extend([[]] * (rule_id + 1 - len(state.rules))) - state.rules[rule_id] = rule - - -# static bool is_digit_char(char c) { -# return '0' <= c && c <= '9'; -# } -def is_digit_char(c: str) -> bool: - return "0" <= c <= "9" - - -# static bool is_word_char(char c) { -# return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c); -# } -def is_word_char(c: str) -> bool: - return ("a" <= c <= "z") or ("A" <= c <= "Z") or c == "-" or is_digit_char(c) - - -# static std::pair parse_hex(const char * src, int size) { -# const char * pos = src; -# const char * end = src + size; -# uint32_t value = 0; -# for ( ; pos < end && *pos; pos++) { -# value <<= 4; -# char c = *pos; -# if ('a' <= c && c <= 'f') { -# value += c - 'a' + 10; -# } else if ('A' <= c && c <= 'F') { -# value += c - 'A' + 10; -# } else if ('0' <= c && c <= '9') { -# value += c - '0'; -# } else { -# break; -# } -# } -# if (pos != end) { -# throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src); -# } -# return std::make_pair(value, pos); -# } -def parse_hex(src: str, size: int) -> typing.Tuple[int, str]: - pos = 0 - value = 0 - for _ in range(size): - value <<= 4 - c = src[pos] - if "a" <= c <= "f": - value += ord(c) - ord("a") + 10 - elif "A" <= c <= "F": - value += ord(c) - ord("A") + 10 - elif "0" <= c <= "9": - value += ord(c) - ord("0") - else: - break - pos += 1 - if pos != size: - raise ValueError(f"expecting {size} hex chars at {src}") - return value, src[pos:] - - -# static const char * parse_space(const char * src, bool newline_ok) { -# const char * pos = src; -# while (*pos == ' ' || *pos == '\t' || *pos == '#' || -# 
(newline_ok && (*pos == '\r' || *pos == '\n'))) { -# if (*pos == '#') { -# while (*pos && *pos != '\r' && *pos != '\n') { -# pos++; -# } -# } else { -# pos++; -# } -# } -# return pos; -# } -def parse_space(src: str, newline_ok: bool) -> str: - pos = src - while pos and (pos[0] in (' ', '\t', '#') or (newline_ok and pos[0] in ('\r', '\n'))): - if pos[0] == "#": - while pos and pos[0] not in ("\r", "\n"): - pos = pos[1:] - else: - pos = pos[1:] - return pos - - -# static const char * parse_name(const char * src) { -# const char * pos = src; -# while (is_word_char(*pos)) { -# pos++; -# } -# if (pos == src) { -# throw std::runtime_error(std::string("expecting name at ") + src); -# } -# return pos; -# } -def parse_name(src: str) -> typing.Tuple[str, str]: - pos = src - while pos and is_word_char(pos[0]): - pos = pos[1:] - if pos == src: - raise ValueError(f"expecting name at {src}") - return src[:len(src) - len(pos)], pos - -# static const char * parse_int(const char * src) { -# const char * pos = src; -# while (is_digit_char(*pos)) { -# pos++; -# } -# if (pos == src) { -# throw std::runtime_error(std::string("expecting integer at ") + src); -# } -# return pos; -# } -def parse_int(src: str) -> typing.Tuple[int, str]: - pos = src - while pos and is_digit_char(pos[0]): - pos = pos[1:] - if pos == src: - raise ValueError(f"expecting integer at {src}") - return int(src[:len(src) - len(pos)]), pos - - -# static std::pair parse_char(const char * src) { -# if (*src == '\\') { -# switch (src[1]) { -# case 'x': return parse_hex(src + 2, 2); -# case 'u': return parse_hex(src + 2, 4); -# case 'U': return parse_hex(src + 2, 8); -# case 't': return std::make_pair('\t', src + 2); -# case 'r': return std::make_pair('\r', src + 2); -# case 'n': return std::make_pair('\n', src + 2); -# case '\\': -# case '"': -# case '[': -# case ']': -# return std::make_pair(src[1], src + 2); -# default: -# throw std::runtime_error(std::string("unknown escape at ") + src); -# } -# } else if (*src) { -# return decode_utf8(src); -# } -# throw std::runtime_error("unexpected end of input"); -# } -def parse_char(src: str) -> typing.Tuple[int, str]: - if not src: - raise ValueError("unexpected end of input") - if src[0] == "\\": - if src[1] == "x": - return parse_hex(src[2:], 2) - elif src[1] == "u": - return parse_hex(src[2:], 4) - elif src[1] == "U": - return parse_hex(src[2:], 8) - elif src[1] == "t": - return ord("\t"), src[2:] - elif src[1] == "r": - return ord("\r"), src[2:] - elif src[1] == "n": - return ord("\n"), src[2:] - elif src[1] in ('\\', '"', '[', ']'): - return ord(src[1]), src[2:] - else: - raise ValueError(f"unknown escape at {src}") - return decode_utf8(src) - -# static const char * parse_sequence( -# parse_state & state, -# const char * src, -# const std::string & rule_name, -# std::vector & out_elements, -# bool is_nested) { -# size_t last_sym_start = out_elements.size(); -# const char * pos = src; -# -# auto handle_repetitions = [&](int min_times, int max_times) { -# -# if (last_sym_start == out_elements.size()) { -# throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos); -# } -# -# // apply transformation to previous symbol (last_sym_start to end) according to -# // the following rewrite rules: -# // S{m,n} --> S S S (m times) S'(n-m) -# // S'(x) ::= S S'(x-1) | -# // (... n-m definitions of these S' rules ...) 
-# // S'(1) ::= S | -# // S{m,} --> S S S (m times) S' -# // S' ::= S S' | -# // S* --> S{0,} -# // --> S' ::= S S' | -# // S+ --> S{1,} -# // --> S S' -# // S' ::= S S' | -# // S? --> S{0,1} -# // --> S' -# // S' ::= S | -# -# std::vector previous_elements(out_elements.begin() + last_sym_start, out_elements.end()); -# if (min_times == 0) { -# out_elements.resize(last_sym_start); -# } else { -# // Repeat the previous elements (min_times - 1) times -# for (int i = 1; i < min_times; i++) { -# out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end()); -# } -# } -# -# uint32_t last_rec_rule_id = 0; -# auto n_opt = max_times < 0 ? 1 : max_times - min_times; -# -# std::vector rec_rule(previous_elements); -# for (int i = 0; i < n_opt; i++) { -# rec_rule.resize(previous_elements.size()); -# uint32_t rec_rule_id = generate_symbol_id(state, rule_name); -# if (i > 0 || max_times < 0) { -# rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id}); -# } -# rec_rule.push_back({LLAMA_GRETYPE_ALT, 0}); -# rec_rule.push_back({LLAMA_GRETYPE_END, 0}); -# add_rule(state, rec_rule_id, rec_rule); -# last_rec_rule_id = rec_rule_id; -# } -# if (n_opt > 0) { -# out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id}); -# } -# }; -# -# while (*pos) { -# if (*pos == '"') { // literal string -# pos++; -# last_sym_start = out_elements.size(); -# while (*pos != '"') { -# if (!*pos) { -# throw std::runtime_error("unexpected end of input"); -# } -# auto char_pair = parse_char(pos); -# pos = char_pair.second; -# out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first}); -# } -# pos = parse_space(pos + 1, is_nested); -# } else if (*pos == '[') { // char range(s) -# pos++; -# enum llama_gretype start_type = LLAMA_GRETYPE_CHAR; -# if (*pos == '^') { -# pos++; -# start_type = LLAMA_GRETYPE_CHAR_NOT; -# } -# last_sym_start = out_elements.size(); -# while (*pos != ']') { -# if (!*pos) { -# throw std::runtime_error("unexpected end of input"); -# } -# auto char_pair = parse_char(pos); -# pos = char_pair.second; -# enum llama_gretype type = last_sym_start < out_elements.size() -# ? 
LLAMA_GRETYPE_CHAR_ALT -# : start_type; -# -# out_elements.push_back({type, char_pair.first}); -# if (pos[0] == '-' && pos[1] != ']') { -# if (!pos[1]) { -# throw std::runtime_error("unexpected end of input"); -# } -# auto endchar_pair = parse_char(pos + 1); -# pos = endchar_pair.second; -# out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first}); -# } -# } -# pos = parse_space(pos + 1, is_nested); -# } else if (is_word_char(*pos)) { // rule reference -# const char * name_end = parse_name(pos); -# uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos); -# pos = parse_space(name_end, is_nested); -# last_sym_start = out_elements.size(); -# out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id}); -# } else if (*pos == '(') { // grouping -# // parse nested alternates into synthesized rule -# pos = parse_space(pos + 1, true); -# uint32_t sub_rule_id = generate_symbol_id(state, rule_name); -# pos = parse_alternates(state, pos, rule_name, sub_rule_id, true); -# last_sym_start = out_elements.size(); -# // output reference to synthesized rule -# out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); -# if (*pos != ')') { -# throw std::runtime_error(std::string("expecting ')' at ") + pos); -# } -# pos = parse_space(pos + 1, is_nested); -# } else if (*pos == '.') { // any char -# last_sym_start = out_elements.size(); -# out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0}); -# pos = parse_space(pos + 1, is_nested); -# } else if (*pos == '*') { -# pos = parse_space(pos + 1, is_nested); -# handle_repetitions(0, -1); -# } else if (*pos == '+') { -# pos = parse_space(pos + 1, is_nested); -# handle_repetitions(1, -1); -# } else if (*pos == '?') { -# pos = parse_space(pos + 1, is_nested); -# handle_repetitions(0, 1); -# } else if (*pos == '{') { -# pos = parse_space(pos + 1, is_nested); -# -# if (!is_digit_char(*pos)) { -# throw std::runtime_error(std::string("expecting an int at ") + pos); -# } -# const char * int_end = parse_int(pos); -# int min_times = std::stoul(std::string(pos, int_end - pos)); -# pos = parse_space(int_end, is_nested); -# -# int max_times = -1; -# -# if (*pos == '}') { -# max_times = min_times; -# pos = parse_space(pos + 1, is_nested); -# } else if (*pos == ',') { -# pos = parse_space(pos + 1, is_nested); -# -# if (is_digit_char(*pos)) { -# const char * int_end = parse_int(pos); -# max_times = std::stoul(std::string(pos, int_end - pos)); -# pos = parse_space(int_end, is_nested); -# } -# -# if (*pos != '}') { -# throw std::runtime_error(std::string("expecting '}' at ") + pos); -# } -# pos = parse_space(pos + 1, is_nested); -# } else { -# throw std::runtime_error(std::string("expecting ',' at ") + pos); -# } -# handle_repetitions(min_times, max_times); -# } else { -# break; -# } -# } -# return pos; -# } -def parse_sequence(state: ParseState, src: str, rule_name: str, out_elements: typing.List[GrammarElement], is_nested: bool) -> str: - last_sym_start = len(out_elements) - pos = src - - def handle_repetitions(min_times: int, max_times: int) -> None: - nonlocal state, src, rule_name, out_elements, is_nested, last_sym_start, pos - - if last_sym_start == len(out_elements): - raise ValueError(f"expecting preceding item to */+/?/{{ at {pos}") - - previous_elements = out_elements[last_sym_start:] - if min_times == 0: - del out_elements[last_sym_start:] - else: - for i in range(1, min_times): - out_elements.extend(previous_elements) - - last_rec_rule_id = 0 - n_opt = 1 if max_times < 0 else max_times - min_times - - rec_rule = previous_elements[:] - for i 
in range(n_opt): - rec_rule = rec_rule[:len(previous_elements)] - rec_rule_id = generate_symbol_id(state, rule_name) - if i > 0 or max_times < 0: - rec_rule.append(GrammarElement(GrammarElementType.RULE_REF, rec_rule_id if max_times < 0 else last_rec_rule_id)) - rec_rule.append(GrammarElement(GrammarElementType.ALT, 0)) - rec_rule.append(GrammarElement(GrammarElementType.END, 0)) - add_rule(state, rec_rule_id, rec_rule) - last_rec_rule_id = rec_rule_id - if n_opt > 0: - out_elements.append(GrammarElement(GrammarElementType.RULE_REF, last_rec_rule_id)) - - while pos: - if pos[0] == '"': - pos = pos[1:] - last_sym_start = len(out_elements) - while not pos.startswith('"'): - if not pos: - raise ValueError("unexpected end of input") - char, pos = parse_char(pos) - out_elements.append(GrammarElement(GrammarElementType.CHAR, char)) - pos = parse_space(pos[1:], is_nested) - elif pos[0] == "[": - pos = pos[1:] - start_type = GrammarElementType.CHAR - if pos[0] == "^": - pos = pos[1:] - start_type = GrammarElementType.CHAR_NOT - last_sym_start = len(out_elements) - while pos[0] != "]": - if not pos: - raise ValueError("unexpected end of input") - char, pos = parse_char(pos) - type = GrammarElementType.CHAR_ALT if last_sym_start < len(out_elements) else start_type - out_elements.append(GrammarElement(type, char)) - if pos[0] == "-" and pos[1] != "]": - if not pos[1]: - raise ValueError("unexpected end of input") - endchar, pos = parse_char(pos[1:]) - out_elements.append(GrammarElement(GrammarElementType.CHAR_RNG_UPPER, endchar)) - pos = parse_space(pos[1:], is_nested) - elif pos and is_word_char(pos[0]): - name, rest = parse_name(pos) - ref_rule_id = get_symbol_id(state, name) - pos = parse_space(rest, is_nested) - last_sym_start = len(out_elements) - out_elements.append(GrammarElement(GrammarElementType.RULE_REF, ref_rule_id)) - elif pos.startswith("("): - pos = parse_space(pos[1:], newline_ok=True) - sub_rule_id = generate_symbol_id(state, rule_name) - pos = parse_alternates(state, pos, rule_name, sub_rule_id, is_nested=True) - last_sym_start = len(out_elements) - out_elements.append(GrammarElement(GrammarElementType.RULE_REF, sub_rule_id)) - if pos[0] != ")": - raise ValueError(f"expecting ')' at {pos}") - pos = parse_space(pos[1:], is_nested) - elif pos.startswith("."): - last_sym_start = len(out_elements) - out_elements.append(GrammarElement(GrammarElementType.CHAR_ANY, 0)) - pos = parse_space(pos[1:], is_nested) - elif pos.startswith("*"): - pos = parse_space(pos[1:], is_nested) - handle_repetitions(0, -1) - elif pos.startswith("+"): - pos = parse_space(pos[1:], is_nested) - handle_repetitions(1, -1) - elif pos.startswith("?"): - pos = parse_space(pos[1:], is_nested) - handle_repetitions(0, 1) - elif pos.startswith("{"): - pos = parse_space(pos[1:], is_nested) - - if not is_digit_char(pos): - raise ValueError(f"expecting an int at {pos}") - min_times, pos = parse_int(pos) - pos = parse_space(pos, is_nested) - - max_times = -1 - - if pos[0] == "}": - max_times = min_times - pos = parse_space(pos[1:], is_nested) - elif pos[0] == ",": - pos = parse_space(pos[1:], is_nested) - - if is_digit_char(pos): - max_times, pos = parse_int(pos) - pos = parse_space(pos, is_nested) - - if pos[0] != "}": - raise ValueError("expecting '}' at {}".format(pos)) - - pos = parse_space(pos[1:], is_nested) - else: - raise ValueError(f"expecting ',' at {pos}") - handle_repetitions(min_times, max_times) - else: - break - return pos - - -# const char * parse_alternates( -# parse_state & state, -# const char * src, -# 
const std::string & rule_name, -# uint32_t rule_id, -# bool is_nested) { -# std::vector rule; -# const char * pos = parse_sequence(state, src, rule_name, rule, is_nested); -# while (*pos == '|') { -# rule.push_back({LLAMA_GRETYPE_ALT, 0}); -# pos = parse_space(pos + 1, true); -# pos = parse_sequence(state, pos, rule_name, rule, is_nested); -# } -# rule.push_back({LLAMA_GRETYPE_END, 0}); -# add_rule(state, rule_id, rule); -# return pos; -# } -def parse_alternates(state: ParseState, src: str, rule_name: str, rule_id: int, is_nested: bool) -> str: - rule = [] - pos = parse_sequence(state, src, rule_name, rule, is_nested) - while pos.startswith("|"): - rule.append(GrammarElement(GrammarElementType.ALT, 0)) - pos = parse_space(pos[1:], newline_ok=True) - pos = parse_sequence(state, pos, rule_name, rule, is_nested) - rule.append(GrammarElement(GrammarElementType.END, 0)) - add_rule(state, rule_id, rule) - return pos - - -# static const char * parse_rule(parse_state & state, const char * src) { -# const char * name_end = parse_name(src); -# const char * pos = parse_space(name_end, false); -# size_t name_len = name_end - src; -# uint32_t rule_id = get_symbol_id(state, src, name_len); -# const std::string name(src, name_len); -# -# if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) { -# throw std::runtime_error(std::string("expecting ::= at ") + pos); -# } -# pos = parse_space(pos + 3, true); -# -# pos = parse_alternates(state, pos, name, rule_id, false); -# -# if (*pos == '\r') { -# pos += pos[1] == '\n' ? 2 : 1; -# } else if (*pos == '\n') { -# pos++; -# } else if (*pos) { -# throw std::runtime_error(std::string("expecting newline or end at ") + pos); -# } -# return parse_space(pos, true); -# } -def parse_rule(state: ParseState, src: str) -> str: - pos = src - name, pos = parse_name(pos) - pos = parse_space(pos, newline_ok=False) - rule_id = get_symbol_id(state, name) - - if not pos.startswith("::="): - raise ValueError(f"expecting ::= at {pos}") - - pos = parse_space(pos[3:], newline_ok=True) - - pos = parse_alternates(state, pos, name, rule_id, is_nested=False) - - if pos.startswith("\r"): - pos = pos[2:] if pos[1] == "\n" else pos[1:] - elif pos.startswith("\n"): - pos = pos[1:] - elif pos: - raise ValueError(f"expecting newline or end at {pos}") - return parse_space(pos, newline_ok=True) - - -# parse_state parse(const char * src) { -# try { -# parse_state state; -# const char * pos = parse_space(src, true); -# while (*pos) { -# pos = parse_rule(state, pos); -# } -# // Validate the state to ensure that all rules are defined -# for (const auto & rule : state.rules) { -# for (const auto & elem : rule) { -# if (elem.type == LLAMA_GRETYPE_RULE_REF) { -# // Ensure that the rule at that location exists -# if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) { -# // Get the name of the rule that is missing -# for (const auto & kv : state.symbol_ids) { -# if (kv.second == elem.value) { -# throw std::runtime_error("Undefined rule identifier '" + kv.first + "'"); -# } -# } -# } -# } -# } -# } -# return state; -# } catch (const std::exception & err) { -# fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what()); -# return parse_state(); -# } -# } -def parse(src: str) -> ParseState: - state = ParseState() - pos = src - pos = parse_space(pos, newline_ok=True) - while pos: - pos = parse_rule(state, pos) - # validate - for rule in state.rules: - for elem in rule: - if elem.type == GrammarElementType.RULE_REF: - if elem.value >= len(state.rules) or not 
state.rules[elem.value]: - for k, v in state.symbol_ids.items(): - if v == elem.value: - raise ValueError(f"Undefined rule identifier '{k}'") - return state - - -# static bool is_char_element(llama_grammar_element elem) { -# switch (elem.type) { -# case LLAMA_GRETYPE_CHAR: return true; -# case LLAMA_GRETYPE_CHAR_NOT: return true; -# case LLAMA_GRETYPE_CHAR_ALT: return true; -# case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true; -# case LLAMA_GRETYPE_CHAR_ANY: return true; -# default: return false; -# } -# } -def is_char_element(elem: GrammarElement) -> bool: - return elem.type in ( - GrammarElementType.CHAR, - GrammarElementType.CHAR_NOT, - GrammarElementType.CHAR_ALT, - GrammarElementType.CHAR_RNG_UPPER, - GrammarElementType.CHAR_ANY - ) - - -def print_grammar_char(file: typing.TextIO, c: int) -> None: - if 0x20 <= c <= 0x7f: - print(chr(c), end="", file=file) - else: - print(f"", end="", file=file) - - -# static void print_rule( -# FILE * file, -# uint32_t rule_id, -# const std::vector & rule, -# const std::map & symbol_id_names) { -# if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) { -# throw std::runtime_error( -# "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id)); -# } -# fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str()); -# for (size_t i = 0, end = rule.size() - 1; i < end; i++) { -# llama_grammar_element elem = rule[i]; -# switch (elem.type) { -# case LLAMA_GRETYPE_END: -# throw std::runtime_error( -# "unexpected end of rule: " + std::to_string(rule_id) + "," + -# std::to_string(i)); -# case LLAMA_GRETYPE_ALT: -# fprintf(file, "| "); -# break; -# case LLAMA_GRETYPE_RULE_REF: -# fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str()); -# break; -# case LLAMA_GRETYPE_CHAR: -# fprintf(file, "["); -# print_grammar_char(file, elem.value); -# break; -# case LLAMA_GRETYPE_CHAR_NOT: -# fprintf(file, "[^"); -# print_grammar_char(file, elem.value); -# break; -# case LLAMA_GRETYPE_CHAR_RNG_UPPER: -# if (i == 0 || !is_char_element(rule[i - 1])) { -# throw std::runtime_error( -# "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " + -# std::to_string(rule_id) + "," + std::to_string(i)); -# } -# fprintf(file, "-"); -# print_grammar_char(file, elem.value); -# break; -# case LLAMA_GRETYPE_CHAR_ALT: -# if (i == 0 || !is_char_element(rule[i - 1])) { -# throw std::runtime_error( -# "LLAMA_GRETYPE_CHAR_ALT without preceding char: " + -# std::to_string(rule_id) + "," + std::to_string(i)); -# } -# print_grammar_char(file, elem.value); -# break; -# case LLAMA_GRETYPE_CHAR_ANY: -# fprintf(file, "."); -# break; -# } -# if (is_char_element(elem)) { -# switch (rule[i + 1].type) { -# case LLAMA_GRETYPE_CHAR_ALT: -# case LLAMA_GRETYPE_CHAR_RNG_UPPER: -# case LLAMA_GRETYPE_CHAR_ANY: -# break; -# default: -# fprintf(file, "] "); -# } -# } -# } -# fprintf(file, "\n"); -# } -def print_rule( - file: typing.TextIO, - rule_id: int, - rule: typing.List[GrammarElement], - symbol_id_names: typing.Dict[int, str], -) -> None: - if not rule or rule[-1].type != GrammarElementType.END: - raise ValueError(f"malformed rule, does not end with LLAMA_GRETYPE_END: {rule_id}") - - print(f"{symbol_id_names[rule_id]} ::=", end=" ", file=file) - - for i, elem in enumerate(rule[:-1]): - if elem.type == GrammarElementType.END: - raise ValueError(f"unexpected end of rule: {rule_id}, {i}") - if elem.type == GrammarElementType.ALT: - print("| ", end="", file=file) - elif elem.type == GrammarElementType.RULE_REF: - print(f"{symbol_id_names[elem.value]} ", end="", file=file) - elif 
elem.type == GrammarElementType.CHAR: - print("[", end="", file=file) - print_grammar_char(file, elem.value) - elif elem.type == GrammarElementType.CHAR_NOT: - print("[^", end="", file=file) - print_grammar_char(file, elem.value) - elif elem.type == GrammarElementType.CHAR_RNG_UPPER: - if i == 0 or not is_char_element(rule[i - 1]): - raise ValueError(f"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: {rule_id}, {i}") - print(f"-", end="", file=file) - print_grammar_char(file, elem.value) - elif elem.type == GrammarElementType.CHAR_ALT: - if i == 0 or not is_char_element(rule[i - 1]): - raise ValueError(f"LLAMA_GRETYPE_CHAR_ALT without preceding char: {rule_id}, {i}") - print_grammar_char(file, elem.value) - elif elem.type == GrammarElementType.CHAR_ANY: - print(".", end="", file=file) - if is_char_element(elem): - if rule[i + 1].type in (GrammarElementType.CHAR_ALT, GrammarElementType.CHAR_RNG_UPPER, GrammarElementType.CHAR_ANY): - continue - print("] ", end="", file=file) - print(file=file) - - -def print_grammar(file: typing.TextIO, state: ParseState) -> None: - try: - symbol_id_names = {v: k for k, v in state.symbol_ids.items()} - for i, rule in enumerate(state.rules): - print_rule(file, i, rule, symbol_id_names) - except Exception as err: - print(f"\nerror printing grammar: {err}", file=file) - raise err LLAMA_GRAMMAR_DEFAULT_ROOT = "root" diff --git a/llama_cpp_python/llama_speculative.py b/llama_cpp/llama_speculative.py similarity index 100% rename from llama_cpp_python/llama_speculative.py rename to llama_cpp/llama_speculative.py diff --git a/llama_cpp_python/llama_tokenizer.py b/llama_cpp/llama_tokenizer.py similarity index 94% rename from llama_cpp_python/llama_tokenizer.py rename to llama_cpp/llama_tokenizer.py index 3a8db0c70..1375e1392 100644 --- a/llama_cpp_python/llama_tokenizer.py +++ b/llama_cpp/llama_tokenizer.py @@ -7,8 +7,8 @@ Any, ) -import llama_cpp_python -from llama_cpp_python.llama_types import List +import llama_cpp +from llama_cpp.llama_types import List class BaseLlamaTokenizer(abc.ABC): @@ -43,7 +43,7 @@ def detokenize( class LlamaTokenizer(BaseLlamaTokenizer): - def __init__(self, llama: llama_cpp_python.Llama): + def __init__(self, llama: llama_cpp.Llama): self._model = llama._model # type: ignore def tokenize( @@ -71,7 +71,7 @@ def decode(self, tokens: List[int]) -> str: @classmethod def from_ggml_file(cls, path: str) -> "LlamaTokenizer": - return cls(llama_cpp_python.Llama(model_path=path, vocab_only=True)) + return cls(llama_cpp.Llama(model_path=path, vocab_only=True)) class LlamaHFTokenizer(BaseLlamaTokenizer): diff --git a/llama_cpp_python/llama_types.py b/llama_cpp/llama_types.py similarity index 99% rename from llama_cpp_python/llama_types.py rename to llama_cpp/llama_types.py index e72f65550..bbb58afc3 100644 --- a/llama_cpp_python/llama_types.py +++ b/llama_cpp/llama_types.py @@ -190,7 +190,6 @@ class ChatCompletionRequestSystemMessage(TypedDict): class ChatCompletionRequestUserMessage(TypedDict): role: Literal["user"] - name: NotRequired[str] content: Optional[Union[str, List[ChatCompletionRequestMessageContentPart]]] @@ -215,7 +214,6 @@ class ChatCompletionRequestAssistantMessageFunctionCall(TypedDict): class ChatCompletionRequestAssistantMessage(TypedDict): role: Literal["assistant"] - name: NotRequired[str] content: Optional[str] tool_calls: NotRequired[ChatCompletionMessageToolCalls] function_call: NotRequired[ @@ -237,6 +235,7 @@ class ChatCompletionRequestFunctionMessage(TypedDict): ChatCompletionRequestMessage = Union[ 
ChatCompletionRequestSystemMessage, + ChatCompletionRequestUserMessage, ChatCompletionRequestAssistantMessage, ChatCompletionRequestUserMessage, ChatCompletionRequestToolMessage, diff --git a/llama_cpp_python/llava_cpp.py b/llama_cpp/llava_cpp.py similarity index 98% rename from llama_cpp_python/llava_cpp.py rename to llama_cpp/llava_cpp.py index f3ff2516d..d9dfaf5fd 100644 --- a/llama_cpp_python/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -20,7 +20,7 @@ TYPE_CHECKING, ) -import llama_cpp_python.llama_cpp as llama_cpp +import llama_cpp.llama_cpp as llama_cpp from llama_cpp._ctypes_extensions import ( load_shared_library, diff --git a/llama_cpp_python/py.typed b/llama_cpp/py.typed similarity index 100% rename from llama_cpp_python/py.typed rename to llama_cpp/py.typed diff --git a/llama_cpp_python/server/__init__.py b/llama_cpp/server/__init__.py similarity index 100% rename from llama_cpp_python/server/__init__.py rename to llama_cpp/server/__init__.py diff --git a/llama_cpp_python/server/__main__.py b/llama_cpp/server/__main__.py similarity index 90% rename from llama_cpp_python/server/__main__.py rename to llama_cpp/server/__main__.py index 78402a3df..bbac4957e 100644 --- a/llama_cpp_python/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -9,13 +9,13 @@ Then run: ``` -uvicorn llama_cpp_python.server.app:create_app --reload +uvicorn llama_cpp.server.app:create_app --reload ``` or ``` -python3 -m llama_cpp_python.server +python3 -m llama_cpp.server ``` Then visit http://localhost:8000/docs to see the interactive API docs. @@ -30,14 +30,14 @@ import uvicorn -from llama_cpp_python.server.app import create_app -from llama_cpp_python.server.settings import ( +from llama_cpp.server.app import create_app +from llama_cpp.server.settings import ( Settings, ServerSettings, ModelSettings, ConfigFileSettings, ) -from llama_cpp_python.server.cli import add_args_from_model, parse_model_from_args +from llama_cpp.server.cli import add_args_from_model, parse_model_from_args def main(): diff --git a/llama_cpp_python/server/app.py b/llama_cpp/server/app.py similarity index 82% rename from llama_cpp_python/server/app.py rename to llama_cpp/server/app.py index 9d629b3a0..cd3255176 100644 --- a/llama_cpp_python/server/app.py +++ b/llama_cpp/server/app.py @@ -1,39 +1,37 @@ from __future__ import annotations -import contextlib -import json -import logging import os +import json import typing -from functools import partial +import contextlib + from threading import Lock +from functools import partial from typing import Iterator, List, Optional, Union, Dict +import llama_cpp + import anyio -import numpy as np from anyio.streams.memory import MemoryObjectSendStream +from starlette.concurrency import run_in_threadpool, iterate_in_threadpool from fastapi import Depends, FastAPI, APIRouter, Request, HTTPException, status, Body from fastapi.middleware import Middleware from fastapi.middleware.cors import CORSMiddleware from fastapi.security import HTTPBearer -from sentence_transformers import SentenceTransformer from sse_starlette.sse import EventSourceResponse -from starlette.concurrency import run_in_threadpool, iterate_in_threadpool -from starlette_context.middleware import RawContextMiddleware from starlette_context.plugins import RequestIdPlugin # type: ignore +from starlette_context.middleware import RawContextMiddleware -import llama_cpp_python -from llama_cpp_python.server.errors import RouteErrorHandler -from llama_cpp_python.server.model import ( +from llama_cpp.server.model import ( LlamaProxy, ) 
-from llama_cpp_python.server.settings import ( +from llama_cpp.server.settings import ( ConfigFileSettings, Settings, ModelSettings, ServerSettings, ) -from llama_cpp_python.server.types import ( +from llama_cpp.server.types import ( CreateCompletionRequest, CreateEmbeddingRequest, CreateChatCompletionRequest, @@ -44,12 +42,12 @@ DetokenizeInputRequest, DetokenizeInputResponse, ) +from llama_cpp.server.errors import RouteErrorHandler + router = APIRouter(route_class=RouteErrorHandler) _server_settings: Optional[ServerSettings] = None -_model_settings: Optional[List[ModelSettings]] = None -hf_embedding_model: dict[str, SentenceTransformer] = dict() def set_server_settings(server_settings: ServerSettings): @@ -99,11 +97,6 @@ def set_ping_message_factory(factory: typing.Callable[[], bytes]): _ping_message_factory = factory -def set_model_settings(model_settings: List[ModelSettings]): - global _model_settings - _model_settings = model_settings - - def create_app( settings: Settings | None = None, server_settings: ServerSettings | None = None, @@ -137,12 +130,11 @@ def create_app( ), "server_settings and model_settings must be provided together" set_server_settings(server_settings) - set_model_settings(model_settings) middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))] app = FastAPI( middleware=middleware, title="🦙 llama.cpp Python API", - version=llama_cpp_python.__version__, + version=llama_cpp.__version__, root_path=server_settings.root_path, ) app.add_middleware( @@ -194,7 +186,7 @@ async def get_event_publisher( def _logit_bias_tokens_to_input_ids( - llama: llama_cpp_python.Llama, + llama: llama_cpp.Llama, logit_bias: Dict[str, float], ) -> Dict[str, float]: to_bias: Dict[str, float] = {} @@ -237,7 +229,7 @@ async def authenticate( summary="Completion", dependencies=[Depends(authenticate)], response_model=Union[ - llama_cpp_python.CreateCompletionResponse, + llama_cpp.CreateCompletionResponse, str, ], responses={ @@ -274,7 +266,7 @@ async def authenticate( async def create_completion( request: Request, body: CreateCompletionRequest, -) -> llama_cpp_python.Completion: +) -> llama_cpp.Completion: exit_stack = contextlib.ExitStack() llama_proxy = await run_in_threadpool( lambda: exit_stack.enter_context(contextlib.contextmanager(get_llama_proxy)()) @@ -311,11 +303,11 @@ async def create_completion( ) if body.grammar is not None: - kwargs["grammar"] = llama_cpp_python.LlamaGrammar.from_string(body.grammar) + kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) if body.min_tokens > 0: - _min_tokens_logits_processor = llama_cpp_python.LogitsProcessorList( - [llama_cpp_python.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] ) if "logits_processor" not in kwargs: kwargs["logits_processor"] = _min_tokens_logits_processor @@ -323,8 +315,8 @@ async def create_completion( kwargs["logits_processor"].extend(_min_tokens_logits_processor) iterator_or_completion: Union[ - llama_cpp_python.CreateCompletionResponse, - Iterator[llama_cpp_python.CreateCompletionStreamResponse], + llama_cpp.CreateCompletionResponse, + Iterator[llama_cpp.CreateCompletionStreamResponse], ] = await run_in_threadpool(llama, **kwargs) if isinstance(iterator_or_completion, Iterator): @@ -333,7 +325,7 @@ async def create_completion( # If no exception was raised from first_response, we can assume that # the iterator is valid and we can use it 
to stream the response. - def iterator() -> Iterator[llama_cpp_python.CreateCompletionStreamResponse]: + def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: yield first_response yield from iterator_or_completion exit_stack.close() @@ -365,53 +357,17 @@ async def create_embedding( request: CreateEmbeddingRequest, llama_proxy: LlamaProxy = Depends(get_llama_proxy), ): - setting = None - for model in _model_settings: - if request.model is None and model.embedding: # if no specify model, use first embedding model - setting = model - break - elif model.embedding and (request.model == model.model or request.model == model.model_alias): - setting = model - break - if setting is None: - raise ValueError('no embedding model or no match correct embedding model name. use embedding=True to note embedding model') - - if setting.is_hf_embedding_model: - if setting.model not in hf_embedding_model: - logging.info(f'load {setting.model}') - hf_embedding_model[setting.model] = SentenceTransformer(setting.model, device='cpu' if setting.n_ctx != -1 else None) - model = hf_embedding_model[setting.model] - embeds: np.ndarray = model.encode(request.input if isinstance(request.input, list) else [request.input], normalize_embeddings=True) - data = [ - { - "object": "embedding", - "embedding": emb.tolist(), - "index": idx, - } - for (idx, emb) in enumerate(embeds) - ] - total_tokens = model.tokenize(request.input if isinstance(request.input, list) else [request.input])['attention_mask'].sum().item() - return { - "object": "list", - "data": data, - "model": setting.model_alias if setting.model_alias is not None else setting.model, - "usage": { - "prompt_tokens": total_tokens, - "total_tokens": total_tokens, - }, - } - else: - return await run_in_threadpool( - llama_proxy(request.model).create_embedding, - **request.model_dump(exclude={"user"}), - ) + return await run_in_threadpool( + llama_proxy(request.model).create_embedding, + **request.model_dump(exclude={"user"}), + ) @router.post( "/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)], - response_model=Union[llama_cpp_python.ChatCompletion, str], + response_model=Union[llama_cpp.ChatCompletion, str], responses={ "200": { "description": "Successful Response", @@ -511,7 +467,7 @@ async def create_chat_completion( }, } ), -) -> EventSourceResponse: +) -> llama_cpp.ChatCompletion: # This is a workaround for an issue in FastAPI dependencies # where the dependency is cleaned up before a StreamingResponse # is complete. 
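A hedged client-side sketch of the reverted endpoints: /v1/embeddings now delegates to Llama.create_embedding on the selected model, and /v1/chat/completions returns a llama_cpp.ChatCompletion-shaped body. The base URL, model alias, and API key below are placeholders, and the embeddings call assumes the model was started with embedding=True.

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-no-key-required")

    # Embeddings are computed by the loaded llama model itself.
    emb = client.embeddings.create(model="my-model", input=["hello world"])
    print(len(emb.data[0].embedding))

    # Chat completions (non-streaming) return the full completion object.
    chat = client.chat.completions.create(
        model="my-model",
        messages=[{"role": "user", "content": "Say hi."}],
    )
    print(chat.choices[0].message.content)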
@@ -541,11 +497,11 @@ async def create_chat_completion( ) if body.grammar is not None: - kwargs["grammar"] = llama_cpp_python.LlamaGrammar.from_string(body.grammar) + kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar) if body.min_tokens > 0: - _min_tokens_logits_processor = llama_cpp_python.LogitsProcessorList( - [llama_cpp_python.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] + _min_tokens_logits_processor = llama_cpp.LogitsProcessorList( + [llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())] ) if "logits_processor" not in kwargs: kwargs["logits_processor"] = _min_tokens_logits_processor @@ -553,7 +509,7 @@ async def create_chat_completion( kwargs["logits_processor"].extend(_min_tokens_logits_processor) iterator_or_completion: Union[ - llama_cpp_python.ChatCompletion, Iterator[llama_cpp_python.ChatCompletionChunk] + llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk] ] = await run_in_threadpool(llama.create_chat_completion, **kwargs) if isinstance(iterator_or_completion, Iterator): @@ -562,7 +518,7 @@ async def create_chat_completion( # If no exception was raised from first_response, we can assume that # the iterator is valid and we can use it to stream the response. - def iterator() -> Iterator[llama_cpp_python.ChatCompletionChunk]: + def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: yield first_response yield from iterator_or_completion exit_stack.close() @@ -654,18 +610,3 @@ async def detokenize( text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8") return DetokenizeInputResponse(text=text) - - -@router.get( - '/v1/internal/model/info', - summary='Model info', - dependencies=[Depends(authenticate)], - tags=[openai_v1_tag], -) -async def info(): - for model in _model_settings: - if _llama_proxy._current_model.model_path == model.model: - return { - "model_name": model.model_alias if model.model_alias is not None else model.model, - "lora_names": _llama_proxy._current_model.lora_path - } \ No newline at end of file diff --git a/llama_cpp_python/server/cli.py b/llama_cpp/server/cli.py similarity index 100% rename from llama_cpp_python/server/cli.py rename to llama_cpp/server/cli.py diff --git a/llama_cpp_python/server/errors.py b/llama_cpp/server/errors.py similarity index 99% rename from llama_cpp_python/server/errors.py rename to llama_cpp/server/errors.py index de9246e08..fbf9fd80d 100644 --- a/llama_cpp_python/server/errors.py +++ b/llama_cpp/server/errors.py @@ -16,7 +16,7 @@ from fastapi.responses import JSONResponse from fastapi.routing import APIRoute -from llama_cpp_python.server.types import ( +from llama_cpp.server.types import ( CreateCompletionRequest, CreateEmbeddingRequest, CreateChatCompletionRequest, @@ -102,7 +102,7 @@ def model_not_found( class RouteErrorHandler(APIRoute): """Custom APIRoute that handles application errors and exceptions""" - # key: regex pattern for original error message from llama_cpp_python + # key: regex pattern for original error message from llama_cpp # value: formatter function pattern_and_formatters: Dict[ "Pattern[str]", diff --git a/llama_cpp_python/server/model.py b/llama_cpp/server/model.py similarity index 83% rename from llama_cpp_python/server/model.py rename to llama_cpp/server/model.py index 1f021584f..c6716f919 100644 --- a/llama_cpp_python/server/model.py +++ b/llama_cpp/server/model.py @@ -4,11 +4,11 @@ from typing import Dict, Optional, Union, List -import llama_cpp_python -import llama_cpp_python.llama_speculative as llama_speculative 
-import llama_cpp_python.llama_tokenizer as llama_tokenizer +import llama_cpp +import llama_cpp.llama_speculative as llama_speculative +import llama_cpp.llama_tokenizer as llama_tokenizer -from llama_cpp_python.server.settings import ModelSettings +from llama_cpp.server.settings import ModelSettings class LlamaProxy: @@ -21,7 +21,7 @@ def __init__(self, models: List[ModelSettings]) -> None: model.model_alias = model.model self._model_settings_dict[model.model_alias] = model - self._current_model: Optional[llama_cpp_python.Llama] = None + self._current_model: Optional[llama_cpp.Llama] = None self._current_model_alias: Optional[str] = None self._default_model_settings: ModelSettings = models[0] @@ -33,7 +33,7 @@ def __init__(self, models: List[ModelSettings]) -> None: ) self._current_model_alias = self._default_model_alias - def __call__(self, model: Optional[str] = None) -> llama_cpp_python.Llama: + def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: if model is None: model = self._default_model_alias @@ -71,97 +71,92 @@ def free(self): del self._current_model @staticmethod - def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp_python.Llama: + def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: chat_handler = None if settings.chat_format == "llava-1-5": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: chat_handler = ( - llama_cpp_python.llama_chat_format.Llava15ChatHandler.from_pretrained( + llama_cpp.llama_chat_format.Llava15ChatHandler.from_pretrained( repo_id=settings.hf_model_repo_id, filename=settings.clip_model_path, verbose=settings.verbose, ) ) else: - chat_handler = llama_cpp_python.llama_chat_format.Llava15ChatHandler( + chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "obsidian": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: chat_handler = ( - llama_cpp_python.llama_chat_format.ObsidianChatHandler.from_pretrained( + llama_cpp.llama_chat_format.ObsidianChatHandler.from_pretrained( repo_id=settings.hf_model_repo_id, filename=settings.clip_model_path, verbose=settings.verbose, ) ) else: - chat_handler = llama_cpp_python.llama_chat_format.ObsidianChatHandler( + chat_handler = llama_cpp.llama_chat_format.ObsidianChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "llava-1-6": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: chat_handler = ( - llama_cpp_python.llama_chat_format.Llava16ChatHandler.from_pretrained( + llama_cpp.llama_chat_format.Llava16ChatHandler.from_pretrained( repo_id=settings.hf_model_repo_id, filename=settings.clip_model_path, verbose=settings.verbose, ) ) else: - chat_handler = llama_cpp_python.llama_chat_format.Llava16ChatHandler( + chat_handler = llama_cpp.llama_chat_format.Llava16ChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "moondream": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: chat_handler = ( - llama_cpp_python.llama_chat_format.MoondreamChatHandler.from_pretrained( + llama_cpp.llama_chat_format.MoondreamChatHandler.from_pretrained( repo_id=settings.hf_model_repo_id, filename=settings.clip_model_path, 
verbose=settings.verbose, ) ) else: - chat_handler = llama_cpp_python.llama_chat_format.MoondreamChatHandler( + chat_handler = llama_cpp.llama_chat_format.MoondreamChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "nanollava": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: chat_handler = ( - llama_cpp_python.llama_chat_format.NanoLlavaChatHandler.from_pretrained( + llama_cpp.llama_chat_format.NanoLlavaChatHandler.from_pretrained( repo_id=settings.hf_model_repo_id, filename=settings.clip_model_path, verbose=settings.verbose, ) ) else: - chat_handler = llama_cpp_python.llama_chat_format.NanoLlavaChatHandler( + chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) elif settings.chat_format == "llama-3-vision-alpha": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: chat_handler = ( - llama_cpp_python.llama_chat_format.Llama3VisionAlpha.from_pretrained( + llama_cpp.llama_chat_format.Llama3VisionAlpha.from_pretrained( repo_id=settings.hf_model_repo_id, filename=settings.clip_model_path, verbose=settings.verbose, ) ) else: - chat_handler = llama_cpp_python.llama_chat_format.Llama3VisionAlpha( + chat_handler = llama_cpp.llama_chat_format.Llama3VisionAlpha( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) - elif settings.chat_format == 'banban-chat': - assert settings.clip_model_path is not None, "clip model not found" - chat_handler = llama_cpp_python.llama_chat_format.BanBanChat( - clip_model_path=settings.clip_model_path, verbose=settings.verbose - ) elif settings.chat_format == "minicpm-v-2.6": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: @@ -181,7 +176,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp_python. settings.hf_pretrained_model_name_or_path is not None ), "hf_pretrained_model_name_or_path must be set for hf-autotokenizer" chat_handler = ( - llama_cpp_python.llama_chat_format.hf_autotokenizer_to_chat_completion_handler( + llama_cpp.llama_chat_format.hf_autotokenizer_to_chat_completion_handler( settings.hf_pretrained_model_name_or_path ) ) @@ -189,11 +184,11 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp_python. assert ( settings.hf_tokenizer_config_path is not None ), "hf_tokenizer_config_path must be set for hf-tokenizer-config" - chat_handler = llama_cpp_python.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler( + chat_handler = llama_cpp.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler( json.load(open(settings.hf_tokenizer_config_path)) ) - tokenizer: Optional[llama_cpp_python.BaseLlamaTokenizer] = None + tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None if settings.hf_pretrained_model_name_or_path is not None: tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained( settings.hf_pretrained_model_name_or_path @@ -230,12 +225,12 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp_python. 
if settings.hf_model_repo_id is not None: create_fn = functools.partial( - llama_cpp_python.Llama.from_pretrained, + llama_cpp.Llama.from_pretrained, repo_id=settings.hf_model_repo_id, filename=settings.model, ) else: - create_fn = llama_cpp_python.Llama + create_fn = llama_cpp.Llama kwargs["model_path"] = settings.model _model = create_fn( @@ -294,10 +289,10 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp_python. if settings.cache_type == "disk": if settings.verbose: print(f"Using disk cache with size {settings.cache_size}") - cache = llama_cpp_python.LlamaDiskCache(capacity_bytes=settings.cache_size) + cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) else: if settings.verbose: print(f"Using ram cache with size {settings.cache_size}") - cache = llama_cpp_python.LlamaRAMCache(capacity_bytes=settings.cache_size) + cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) _model.set_cache(cache) return _model diff --git a/llama_cpp_python/server/settings.py b/llama_cpp/server/settings.py similarity index 90% rename from llama_cpp_python/server/settings.py rename to llama_cpp/server/settings.py index fcbe863d5..13c951241 100644 --- a/llama_cpp_python/server/settings.py +++ b/llama_cpp/server/settings.py @@ -8,7 +8,7 @@ from pydantic import Field, model_validator from pydantic_settings import BaseSettings -import llama_cpp_python +import llama_cpp # Disable warning for model and model_alias settings BaseSettings.model_config["protected_namespaces"] = () @@ -31,7 +31,7 @@ class ModelSettings(BaseSettings): description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.", ) split_mode: int = Field( - default=llama_cpp_python.LLAMA_SPLIT_MODE_LAYER, + default=llama_cpp.LLAMA_SPLIT_MODE_LAYER, description="The split mode to use.", ) main_gpu: int = Field( @@ -47,11 +47,11 @@ class ModelSettings(BaseSettings): default=False, description="Whether to only return the vocabulary." ) use_mmap: bool = Field( - default=llama_cpp_python.llama_supports_mmap(), + default=llama_cpp.llama_supports_mmap(), description="Use mmap.", ) use_mlock: bool = Field( - default=llama_cpp_python.llama_supports_mlock(), + default=llama_cpp.llama_supports_mlock(), description="Use mlock.", ) kv_overrides: Optional[List[str]] = Field( @@ -64,7 +64,7 @@ class ModelSettings(BaseSettings): ) # Context Params seed: int = Field( - default=llama_cpp_python.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." + default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." ) n_ctx: int = Field(default=2048, ge=0, description="The context size.") n_batch: int = Field( @@ -84,7 +84,7 @@ class ModelSettings(BaseSettings): description="The number of threads to use when batch processing. 
Use -1 for max cpu threads", ) rope_scaling_type: int = Field( - default=llama_cpp_python.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED + default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED ) rope_freq_base: float = Field(default=0.0, description="RoPE base frequency") rope_freq_scale: float = Field( @@ -100,11 +100,6 @@ class ModelSettings(BaseSettings): ) logits_all: bool = Field(default=True, description="Whether to return logits.") embedding: bool = Field(default=False, description="Whether to use embeddings.") - is_hf_embedding_model: bool = Field(default=False, description='if embedding=True and is_hf_embedding_model=True, ' - 'it will use sentence_transformers to load model' - '(it will persist in memory, so do not use big model ' - 'if your memory is not big enough)') - offload_kqv: bool = Field( default=True, description="Whether to offload kqv to the GPU." ) diff --git a/llama_cpp_python/server/types.py b/llama_cpp/server/types.py similarity index 95% rename from llama_cpp_python/server/types.py rename to llama_cpp/server/types.py index 6465c3337..fdd164456 100644 --- a/llama_cpp_python/server/types.py +++ b/llama_cpp/server/types.py @@ -5,7 +5,7 @@ from pydantic import BaseModel, Field -import llama_cpp_python +import llama_cpp model_field = Field( @@ -190,22 +190,22 @@ class ChatCompletionRequestMessage(BaseModel): class CreateChatCompletionRequest(BaseModel): - messages: List[llama_cpp_python.ChatCompletionRequestMessage] = Field( + messages: List[llama_cpp.ChatCompletionRequestMessage] = Field( default=[], description="A list of messages to generate completions for." ) - functions: Optional[List[llama_cpp_python.ChatCompletionFunction]] = Field( + functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field( default=None, description="A list of functions to apply to the generated completions.", ) - function_call: Optional[llama_cpp_python.ChatCompletionRequestFunctionCall] = Field( + function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field( default=None, description="A function to apply to the generated completions.", ) - tools: Optional[List[llama_cpp_python.ChatCompletionTool]] = Field( + tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field( default=None, description="A list of tools to apply to the generated completions.", ) - tool_choice: Optional[llama_cpp_python.ChatCompletionToolChoiceOption] = Field( + tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field( default=None, description="A tool to apply to the generated completions.", ) # TODO: verify @@ -232,7 +232,7 @@ class CreateChatCompletionRequest(BaseModel): frequency_penalty: Optional[float] = frequency_penalty_field logit_bias: Optional[Dict[str, float]] = Field(None) seed: Optional[int] = Field(None) - response_format: Optional[llama_cpp_python.ChatCompletionRequestResponseFormat] = Field( + response_format: Optional[llama_cpp.ChatCompletionRequestResponseFormat] = Field( default=None, )
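
For reference, the loader path exercised by the `server/model.py` hunks above can be driven directly from Python. The sketch below is illustrative only and is not part of this patch: the model and CLIP projector paths are hypothetical placeholders, and it assumes the post-rename `llama_cpp.server` package layout that this patch restores.

```python
# Minimal sketch (not part of this patch): drive the dispatch in
# LlamaProxy.load_llama_from_model_settings() for a vision chat format.
# The file paths below are hypothetical placeholders.
from llama_cpp.server.model import LlamaProxy
from llama_cpp.server.settings import ModelSettings

settings = ModelSettings(
    model="models/llava-v1.5-7b.Q4_K_M.gguf",   # hypothetical GGUF path
    chat_format="llava-1-5",                    # selects Llava15ChatHandler
    clip_model_path="models/mmproj-f16.gguf",   # required for vision chat formats
    cache=True,
    cache_type="ram",                           # "disk" would pick LlamaDiskCache instead
    n_ctx=2048,
    verbose=True,
)

llama = LlamaProxy.load_llama_from_model_settings(settings)
```

Any `chat_format` handled in the `elif` chain above (e.g. `"moondream"`, `"nanollava"`, `"llama-3-vision-alpha"`) follows the same pattern; the assertion on `clip_model_path` is the only extra requirement for the vision formats.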
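
Similarly, the `CreateChatCompletionRequest` fields touched in the `server/types.py` hunk map directly onto the JSON body accepted by `/v1/chat/completions`. A minimal sketch, assuming a server already running on the default `localhost:8000`:

```python
# Minimal sketch (not part of this patch): a request body matching the
# CreateChatCompletionRequest model above. Assumes the server was started
# separately, e.g. `python -m llama_cpp.server --model <path>`.
import requests

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Summarize what a RoPE frequency base is."},
    ],
    "temperature": 0.7,
    "seed": 1234,
    # Optional: constrain the reply to valid JSON via response_format.
    "response_format": {"type": "json_object"},
}

resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```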