Merge branch 'main' into generic-vlm-chat-format
Commit 22c55cd (2 parents: 8f09d42 + c07db99)

File tree: 8 files changed (+77 -54 lines)

.github/dependabot.yml (+4)

@@ -9,3 +9,7 @@ updates:
     directory: "/" # Location of package manifests
     schedule:
       interval: "weekly"
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"

.github/workflows/build-and-release.yaml (+2 -2)

@@ -29,7 +29,7 @@ jobs:
           python -m pip install -e .[all]

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.16.5
+        uses: pypa/cibuildwheel@v2.17.0
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -55,7 +55,7 @@ jobs:
           platforms: linux/arm64

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.16.5
+        uses: pypa/cibuildwheel@v2.17.0
         env:
           CIBW_SKIP: "*musllinux* pp*"
           CIBW_REPAIR_WHEEL_COMMAND: ""

.github/workflows/build-docker.yaml (+2 -2)

@@ -23,15 +23,15 @@ jobs:
         uses: docker/setup-buildx-action@v2

       - name: Login to GitHub Container Registry
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
         with:
           registry: ghcr.io
           username: ${{ github.repository_owner }}
           password: ${{ secrets.GITHUB_TOKEN }}

       - name: Build and push
         id: docker_build
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v5
         with:
           context: .
           file: "docker/simple/Dockerfile"

.github/workflows/build-wheels-cuda.yaml (+2 -2)

@@ -52,7 +52,7 @@ jobs:
           python-version: ${{ matrix.pyver }}

       - name: Setup Mamba
-        uses: conda-incubator/setup-miniconda@v2.2.0
+        uses: conda-incubator/setup-miniconda@v3.0.4
         with:
           activate-environment: "build"
           python-version: ${{ matrix.pyver }}
@@ -65,7 +65,7 @@ jobs:
       - name: VS Integration Cache
         id: vs-integration-cache
         if: runner.os == 'Windows'
-        uses: actions/cache@v3.3.2
+        uses: actions/cache@v4.0.2
         with:
           path: ./MSBuildExtensions
           key: cuda-${{ matrix.cuda }}-vs-integration

.github/workflows/build-wheels-metal.yaml (+2 -2)

@@ -41,7 +41,7 @@ jobs:
         with:
           submodules: "recursive"

-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.pyver }}

@@ -78,7 +78,7 @@ jobs:
             VERBOSE=1 python -m build --wheel
           fi

-      - uses: softprops/action-gh-release@v1
+      - uses: softprops/action-gh-release@v2
         with:
           files: dist/*
           # set release name to <tag>-metal

README.md (+2)

@@ -484,6 +484,8 @@ Due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is requi
     tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF")
 )
 ```
+
+**NOTE**: There is no need to provide the default system messages used in Functionary as they are added automatically in the Functionary chat handler. Thus, the messages should contain just the chat messages and/or system messages that provide additional context for the model (e.g.: datetime, etc.).
 </details>

 ### Multi-modal Models
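To make the added note concrete, here is a minimal sketch of a Functionary request that supplies only task context and the user turn, leaving the default Functionary system prompts to the chat handler. The repo id and tokenizer follow the README example above; the model filename, the `get_current_weather` tool, and the datetime message are illustrative assumptions, not part of the commit:

```python
from llama_cpp import Llama
from llama_cpp.llama_tokenizer import LlamaHFTokenizer

llm = Llama.from_pretrained(
    repo_id="meetkai/functionary-small-v2.2-GGUF",
    filename="functionary-small-v2.2.q4_0.gguf",  # assumed filename, for illustration
    chat_format="functionary-v2",
    tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF"),
)

# No Functionary default system prompt here -- the chat handler injects it.
# Only extra context (e.g. the current date) plus the actual chat messages are passed.
response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "Current date: 2024-04-01"},
        {"role": "user", "content": "What is the weather like in Singapore?"},
    ],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",  # hypothetical tool
                "description": "Get the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
    tool_choice="auto",
)
print(response["choices"][0]["message"])
```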

examples/low_level_api/quantize.py (+8 -5)

@@ -4,14 +4,16 @@


 def main(args):
+    fname_inp = args.fname_inp.encode("utf-8")
+    fname_out = args.fname_out.encode("utf-8")
     if not os.path.exists(fname_inp):
         raise RuntimeError(f"Input file does not exist ({fname_inp})")
     if os.path.exists(fname_out):
         raise RuntimeError(f"Output file already exists ({fname_out})")
-    fname_inp = args.fname_inp.encode("utf-8")
-    fname_out = args.fname_out.encode("utf-8")
-    itype = args.itype
-    return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype)
+    ftype = args.type
+    args = llama_cpp.llama_model_quantize_default_params()
+    args.ftype = ftype
+    return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, args)
     if return_code != 0:
         raise RuntimeError("Failed to quantize model")

@@ -20,6 +22,7 @@ def main(args):
     parser = argparse.ArgumentParser()
     parser.add_argument("fname_inp", type=str, help="Path to input model")
     parser.add_argument("fname_out", type=str, help="Path to output model")
-    parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)")
+    parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1), see llama_cpp.py for enum")
     args = parser.parse_args()
     main(args)
+
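The example now builds a `llama_model_quantize_params` struct instead of passing a bare integer to the low-level binding. A minimal standalone sketch of the updated call sequence, with illustrative file paths:

```python
import llama_cpp

# Illustrative paths; the low-level C API takes bytes, hence the byte literals.
fname_inp = b"./models/model-f16.gguf"
fname_out = b"./models/model-q4_0.gguf"

# Start from the default quantization parameters and set only the target ftype.
params = llama_cpp.llama_model_quantize_default_params()
params.ftype = 2  # 2: q4_0, 3: q4_1 -- see the ftype enum in llama_cpp.py

return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, params)
if return_code != 0:
    raise RuntimeError("Failed to quantize model")
```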

llama_cpp/llama_chat_format.py (+55 -41)

@@ -1830,27 +1830,35 @@ def prepare_messages_for_inference(
         version: Literal["v1", "v2"],
         functions: Optional[List[llama_types.ChatCompletionFunctions]] = None,
         tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+        tool_choice: Union[Dict, str] = "auto",
     ):
         all_messages: List[llama_types.ChatCompletionRequestMessage] = []
-        if functions is not None:
+        if tool_choice == "none":
             all_messages.append(
                 llama_types.ChatCompletionRequestSystemMessage(
-                    role="system", content=generate_schema_from_functions(functions)
+                    role="system", content=generate_schema_from_functions([])
                 )
             )
-        elif tools is not None:
-            all_messages.append(
-                llama_types.ChatCompletionRequestSystemMessage(
-                    role="system",
-                    content=generate_schema_from_functions(
-                        [
-                            tool["function"]
-                            for tool in tools
-                            if tool["type"] == "function"
-                        ]
-                    ),
+        else:
+            if functions is not None:
+                all_messages.append(
+                    llama_types.ChatCompletionRequestSystemMessage(
+                        role="system", content=generate_schema_from_functions(functions)
+                    )
+                )
+            elif tools is not None and tool_choice != "none":
+                all_messages.append(
+                    llama_types.ChatCompletionRequestSystemMessage(
+                        role="system",
+                        content=generate_schema_from_functions(
+                            [
+                                tool["function"]
+                                for tool in tools
+                                if tool["type"] == "function"
+                            ]
+                        ),
+                    )
                 )
-            )

         all_messages.append(
             llama_types.ChatCompletionRequestSystemMessage(
@@ -1890,7 +1898,7 @@ def prepare_messages_for_inference(
         function_call = "auto"

     prompt = prepare_messages_for_inference(
-        messages, tokenizer, version, functions, tools
+        messages, tokenizer, version, functions, tools, function_call
     )

     # If no tools/functions are provided
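The two hunks above add a `tool_choice` parameter to `prepare_messages_for_inference` and pass the resolved `function_call` value through, so `tool_choice="none"` now yields an empty function schema rather than advertising the tools. A simplified, self-contained sketch of that selection logic, assuming `generate_schema_from_functions` is importable from `llama_cpp.llama_chat_format`, where it is defined; `select_function_schema` is a hypothetical helper written only to illustrate the branch, not the library's API:

```python
from typing import Dict, List, Optional, Union

from llama_cpp.llama_chat_format import generate_schema_from_functions


def select_function_schema(
    functions: Optional[List[dict]] = None,
    tools: Optional[List[dict]] = None,
    tool_choice: Union[Dict, str] = "auto",
) -> Optional[str]:
    """Mirror of the added branch: which schema goes into the system message."""
    if tool_choice == "none":
        # Expose no functions at all, steering the model away from function calls.
        return generate_schema_from_functions([])
    if functions is not None:
        return generate_schema_from_functions(functions)
    if tools is not None:
        return generate_schema_from_functions(
            [tool["function"] for tool in tools if tool["type"] == "function"]
        )
    # Neither functions nor tools supplied: no schema message is needed.
    return None
```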
@@ -1987,17 +1995,12 @@ def create_completion(stop):

     content = ""
     function_calls, function_bodies = [], []
+    completion_tokens = 0

     if version == "v1":
         # If no or "auto" tool_choice/function_call
         if isinstance(function_call, str) and function_call == "auto":
             stops = ["\n", END_ASSISTANT_TOKEN]
-        # If tool_choice/function_call is "none"
-        elif isinstance(function_call, str) and function_call == "none":
-            prompt = prepare_messages_for_inference(
-                messages, tokenizer, version, [], []
-            )
-            stops = END_ASSISTANT_TOKEN
         # If tool_choice/function_call is provided
         elif isinstance(function_call, dict):
             prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n"
@@ -2011,12 +2014,15 @@ def create_completion(stop):

         completion = create_completion(stop=stops)
         completion_text = completion["choices"][0]["text"]
+        completion_tokens += completion["usage"]["completion_tokens"]
+

         # If the generation does not involve a function call
         if (
             START_FUNCTION_CALL_TOKEN not in prompt
             and START_FUNCTION_CALL_TOKEN not in completion_text
         ):
+            completion["usage"]["completion_tokens"] = completion_tokens
             return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
         # If the generation involves a function call in completion, generate the parameters
         elif (
@@ -2034,30 +2040,22 @@ def create_completion(stop):
             )
             grammar = get_grammar(function_calls[-1])
             completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
+            completion_tokens += completion["usage"]["completion_tokens"]
             function_bodies.append(completion["choices"][0]["text"].strip())
         # If the prompt involves a function call, just append generated parameters to function_bodies
         else:
             function_bodies.append(completion_text.strip())
     else:
-        # If tool_choice/function_call is "none"
-        if isinstance(function_call, str) and function_call == "none":
-            prompt = (
-                prepare_messages_for_inference(messages, tokenizer, version, [], [])
-                + "all\n<|content|>"
-            )
-            stops = [STOP_TOKEN, FROM_TOKEN]
-            completion = create_completion(stop=stops)
-            completion["choices"][0]["text"] = completion["choices"][0]["text"].strip()
-            return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
         # If tool_choice/function_call is provided
-        elif isinstance(function_call, dict):
+        if isinstance(function_call, dict):
             prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
             function_call = function_call["name"]
             function_calls.append(function_call)
             grammar = get_grammar(function_call)
             stops = [STOP_TOKEN, FROM_TOKEN]
             completion = create_completion(stop=stops)
             completion_text = completion["choices"][0]["text"]
+            completion_tokens += completion["usage"]["completion_tokens"]
             function_bodies.append(completion_text.strip())
         # If "auto" or no tool_choice/function_call
         elif isinstance(function_call, str) and function_call == "auto":
@@ -2067,6 +2065,7 @@ def create_completion(stop):
             stops = CONTENT_TOKEN
             completion = create_completion(stop=stops)
             completion_text = completion["choices"][0]["text"]
+            completion_tokens += completion["usage"]["completion_tokens"]
             function_name = completion_text.strip()
             if function_name == "all":
                 prompt += "all\n<|content|>"
@@ -2079,12 +2078,23 @@ def create_completion(stop):
             stops = [RECIPIENT_TOKEN, STOP_TOKEN]
             completion = create_completion(stop=stops)
             completion_text = completion["choices"][0]["text"]
+            completion_tokens += completion["usage"]["completion_tokens"]
             if function_name == "all":
-                content += completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n")
+                if completion_text.endswith("\n<|from|>assistant\n"):
+                    content += completion_text[:-len("\n<|from|>assistant\n")]
+                if completion_text.endswith("\n<|from|> assistant\n"):
+                    content += completion_text[-len("\n<|from|> assistant\n")]
+                else:
+                    content += completion_text
                 content = content.lstrip()
                 # Check whether the model wants to generate another turn
                 if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
-                    cleaned_completion_text = completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n").strip()
+                    if completion_text.endswith("\n<|from|>assistant\n"):
+                        cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip()
+                    elif completion_text.endswith("\n<|from|> assistant\n"):
+                        cleaned_completion_text = completion_text[-len("\n<|from|> assistant\n")].strip()
+                    else:
+                        cleaned_completion_text = completion_text.strip()
                     prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
                 else:
                     break
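The hunk above swaps `str.removesuffix`, which requires Python 3.9+, for explicit `endswith` checks plus slicing so the handler still runs on Python 3.8. The equivalent pattern in isolation, as a small sketch; the suffix value is just one of the Functionary markers used above:

```python
def strip_suffix(text: str, suffix: str) -> str:
    """Python 3.8-compatible stand-in for text.removesuffix(suffix)."""
    if suffix and text.endswith(suffix):
        return text[: -len(suffix)]
    return text


cleaned = strip_suffix("Hello!\n<|from|>assistant\n", "\n<|from|>assistant\n").strip()
assert cleaned == "Hello!"
```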
@@ -2094,6 +2104,7 @@ def create_completion(stop):
                 prompt += completion_text.strip()
                 grammar = None
                 completion = create_completion(stop=stops)
+                completion_tokens += completion["usage"]["completion_tokens"]
                 if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
                     prompt += "\n<|from|>assistant\n<|recipient|>"
                 else:
@@ -2122,12 +2133,16 @@ def create_completion(stop):
         )

     # TODO: support stream mode
-    function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {
-        "function_call": {
-            "name": tool_calls[0]["function"]["name"],
-            "arguments": tool_calls[0]["function"]["arguments"],
-        }
-    } if len(tool_calls) == 1 else {}
+    function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {}
+    if len(tool_calls) > 0:
+        if tools is not None:
+            function_call_dict["tool_calls"] = tool_calls
+        else:
+            function_call_dict["function_call"] = {
+                "name": tool_calls[0]["function"]["name"],
+                "arguments": tool_calls[0]["function"]["arguments"],
+            }
+    completion["usage"]["completion_tokens"] = completion_tokens
     return llama_types.CreateChatCompletionResponse(
         id="chat" + completion["id"],
         object="chat.completion",
@@ -2140,7 +2155,6 @@ def create_completion(stop):
             "message": {
                 "role": "assistant",
                 "content": None if content == "" else content,
-                "tool_calls": tool_calls,
                 **function_call_dict,
             },
             "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop",
