Merge branch 'main' into generic-vlm-chat-format
Commit 22c55cd (2 parents: 8f09d42 + c07db99)

File tree: 8 files changed (+77 -54 lines)

.github/dependabot.yml (+4)

@@ -9,3 +9,7 @@ updates:
     directory: "/" # Location of package manifests
     schedule:
       interval: "weekly"
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"

.github/workflows/build-and-release.yaml (+2 -2)

@@ -29,7 +29,7 @@ jobs:
           python -m pip install -e .[all]

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.16.5
+        uses: pypa/cibuildwheel@v2.17.0
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -55,7 +55,7 @@ jobs:
           platforms: linux/arm64

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.16.5
+        uses: pypa/cibuildwheel@v2.17.0
         env:
           CIBW_SKIP: "*musllinux* pp*"
           CIBW_REPAIR_WHEEL_COMMAND: ""

.github/workflows/build-docker.yaml (+2 -2)

@@ -23,15 +23,15 @@ jobs:
         uses: docker/setup-buildx-action@v2

       - name: Login to GitHub Container Registry
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
         with:
           registry: ghcr.io
           username: ${{ github.repository_owner }}
           password: ${{ secrets.GITHUB_TOKEN }}

       - name: Build and push
         id: docker_build
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v5
         with:
           context: .
           file: "docker/simple/Dockerfile"

.github/workflows/build-wheels-cuda.yaml (+2 -2)

@@ -52,7 +52,7 @@ jobs:
           python-version: ${{ matrix.pyver }}

       - name: Setup Mamba
-        uses: conda-incubator/setup-miniconda@v2.2.0
+        uses: conda-incubator/setup-miniconda@v3.0.4
         with:
           activate-environment: "build"
           python-version: ${{ matrix.pyver }}
@@ -65,7 +65,7 @@ jobs:
       - name: VS Integration Cache
         id: vs-integration-cache
         if: runner.os == 'Windows'
-        uses: actions/cache@v3.3.2
+        uses: actions/cache@v4.0.2
         with:
           path: ./MSBuildExtensions
           key: cuda-${{ matrix.cuda }}-vs-integration

.github/workflows/build-wheels-metal.yaml (+2 -2)

@@ -41,7 +41,7 @@ jobs:
         with:
           submodules: "recursive"

-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.pyver }}

@@ -78,7 +78,7 @@ jobs:
             VERBOSE=1 python -m build --wheel
           fi

-      - uses: softprops/action-gh-release@v1
+      - uses: softprops/action-gh-release@v2
         with:
           files: dist/*
           # set release name to <tag>-metal

README.md (+2)

@@ -484,6 +484,8 @@ Due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is requi
     tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF")
 )
 ```
+
+**NOTE**: There is no need to provide the default system messages used in Functionary as they are added automatically in the Functionary chat handler. Thus, the messages should contain just the chat messages and/or system messages that provide additional context for the model (e.g.: datetime, etc.).
 </details>

 ### Multi-modal Models
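To make the added note concrete, here is a minimal sketch of a Functionary request that supplies only task context and the user turn, leaving the default Functionary system prompts to the chat handler. The repo id and tokenizer follow the README example above; the model filename, the `get_current_weather` tool, and the datetime message are illustrative assumptions, not part of the commit:

```python
from llama_cpp import Llama
from llama_cpp.llama_tokenizer import LlamaHFTokenizer

llm = Llama.from_pretrained(
    repo_id="meetkai/functionary-small-v2.2-GGUF",
    filename="functionary-small-v2.2.q4_0.gguf",  # assumed filename, for illustration
    chat_format="functionary-v2",
    tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF"),
)

# No Functionary default system prompt here -- the chat handler injects it.
# Only extra context (e.g. the current date) plus the actual chat messages are passed.
response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "Current date: 2024-04-01"},
        {"role": "user", "content": "What is the weather like in Singapore?"},
    ],
    tools=[
        {
            "type": "function",
            "function": {
                "name": "get_current_weather",  # hypothetical tool
                "description": "Get the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
    tool_choice="auto",
)
print(response["choices"][0]["message"])
```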

examples/low_level_api/quantize.py (+8 -5)

@@ -4,14 +4,16 @@


 def main(args):
+    fname_inp = args.fname_inp.encode("utf-8")
+    fname_out = args.fname_out.encode("utf-8")
     if not os.path.exists(fname_inp):
         raise RuntimeError(f"Input file does not exist ({fname_inp})")
     if os.path.exists(fname_out):
         raise RuntimeError(f"Output file already exists ({fname_out})")
-    fname_inp = args.fname_inp.encode("utf-8")
-    fname_out = args.fname_out.encode("utf-8")
-    itype = args.itype
-    return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype)
+    ftype = args.type
+    args = llama_cpp.llama_model_quantize_default_params()
+    args.ftype = ftype
+    return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, args)
     if return_code != 0:
         raise RuntimeError("Failed to quantize model")

@@ -20,6 +22,7 @@ def main(args):
     parser = argparse.ArgumentParser()
     parser.add_argument("fname_inp", type=str, help="Path to input model")
     parser.add_argument("fname_out", type=str, help="Path to output model")
-    parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)")
+    parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1), see llama_cpp.py for enum")
     args = parser.parse_args()
     main(args)
+
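The example now builds a `llama_model_quantize_params` struct instead of passing a bare integer to the low-level binding. A minimal standalone sketch of the updated call sequence, with illustrative file paths:

```python
import llama_cpp

# Illustrative paths; the low-level C API takes bytes, hence the byte literals.
fname_inp = b"./models/model-f16.gguf"
fname_out = b"./models/model-q4_0.gguf"

# Start from the default quantization parameters and set only the target ftype.
params = llama_cpp.llama_model_quantize_default_params()
params.ftype = 2  # 2: q4_0, 3: q4_1 -- see the ftype enum in llama_cpp.py

return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, params)
if return_code != 0:
    raise RuntimeError("Failed to quantize model")
```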

llama_cpp/llama_chat_format.py (+55 -41)

@@ -1830,27 +1830,35 @@ def prepare_messages_for_inference(
         version: Literal["v1", "v2"],
         functions: Optional[List[llama_types.ChatCompletionFunctions]] = None,
         tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+        tool_choice: Union[Dict, str] = "auto",
     ):
         all_messages: List[llama_types.ChatCompletionRequestMessage] = []
-        if functions is not None:
+        if tool_choice == "none":
             all_messages.append(
                 llama_types.ChatCompletionRequestSystemMessage(
-                    role="system", content=generate_schema_from_functions(functions)
+                    role="system", content=generate_schema_from_functions([])
                 )
             )
-        elif tools is not None:
-            all_messages.append(
-                llama_types.ChatCompletionRequestSystemMessage(
-                    role="system",
-                    content=generate_schema_from_functions(
-                        [
-                            tool["function"]
-                            for tool in tools
-                            if tool["type"] == "function"
-                        ]
-                    ),
+        else:
+            if functions is not None:
+                all_messages.append(
+                    llama_types.ChatCompletionRequestSystemMessage(
+                        role="system", content=generate_schema_from_functions(functions)
+                    )
+                )
+            elif tools is not None and tool_choice != "none":
+                all_messages.append(
+                    llama_types.ChatCompletionRequestSystemMessage(
+                        role="system",
+                        content=generate_schema_from_functions(
+                            [
+                                tool["function"]
+                                for tool in tools
+                                if tool["type"] == "function"
+                            ]
+                        ),
+                    )
                 )
-            )

         all_messages.append(
             llama_types.ChatCompletionRequestSystemMessage(
@@ -1890,7 +1898,7 @@ def prepare_messages_for_inference(
         function_call = "auto"

     prompt = prepare_messages_for_inference(
-        messages, tokenizer, version, functions, tools
+        messages, tokenizer, version, functions, tools, function_call
     )

     # If no tools/functions are provided
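The two hunks above add a `tool_choice` parameter to `prepare_messages_for_inference` and pass the resolved `function_call` value through, so `tool_choice="none"` now yields an empty function schema rather than advertising the tools. A simplified, self-contained sketch of that selection logic, assuming `generate_schema_from_functions` is importable from `llama_cpp.llama_chat_format`, where it is defined; `select_function_schema` is a hypothetical helper written only to illustrate the branch, not the library's API:

```python
from typing import Dict, List, Optional, Union

from llama_cpp.llama_chat_format import generate_schema_from_functions


def select_function_schema(
    functions: Optional[List[dict]] = None,
    tools: Optional[List[dict]] = None,
    tool_choice: Union[Dict, str] = "auto",
) -> Optional[str]:
    """Mirror of the added branch: which schema goes into the system message."""
    if tool_choice == "none":
        # Expose no functions at all, steering the model away from function calls.
        return generate_schema_from_functions([])
    if functions is not None:
        return generate_schema_from_functions(functions)
    if tools is not None:
        return generate_schema_from_functions(
            [tool["function"] for tool in tools if tool["type"] == "function"]
        )
    # Neither functions nor tools supplied: no schema message is needed.
    return None
```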
@@ -1987,17 +1995,12 @@ def create_completion(stop):

     content = ""
     function_calls, function_bodies = [], []
+    completion_tokens = 0

     if version == "v1":
         # If no or "auto" tool_choice/function_call
         if isinstance(function_call, str) and function_call == "auto":
             stops = ["\n", END_ASSISTANT_TOKEN]
-        # If tool_choice/function_call is "none"
-        elif isinstance(function_call, str) and function_call == "none":
-            prompt = prepare_messages_for_inference(
-                messages, tokenizer, version, [], []
-            )
-            stops = END_ASSISTANT_TOKEN
         # If tool_choice/function_call is provided
         elif isinstance(function_call, dict):
             prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n"
@@ -2011,12 +2014,15 @@ def create_completion(stop):

         completion = create_completion(stop=stops)
         completion_text = completion["choices"][0]["text"]
+        completion_tokens += completion["usage"]["completion_tokens"]
+

         # If the generation does not involve a function call
         if (
             START_FUNCTION_CALL_TOKEN not in prompt
             and START_FUNCTION_CALL_TOKEN not in completion_text
         ):
+            completion["usage"]["completion_tokens"] = completion_tokens
             return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
         # If the generation involves a function call in completion, generate the parameters
         elif (
@@ -2034,30 +2040,22 @@ def create_completion(stop):
             )
             grammar = get_grammar(function_calls[-1])
             completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
+            completion_tokens += completion["usage"]["completion_tokens"]
             function_bodies.append(completion["choices"][0]["text"].strip())
         # If the prompt involves a function call, just append generated parameters to function_bodies
         else:
             function_bodies.append(completion_text.strip())
     else:
-        # If tool_choice/function_call is "none"
-        if isinstance(function_call, str) and function_call == "none":
-            prompt = (
-                prepare_messages_for_inference(messages, tokenizer, version, [], [])
-                + "all\n<|content|>"
-            )
-            stops = [STOP_TOKEN, FROM_TOKEN]
-            completion = create_completion(stop=stops)
-            completion["choices"][0]["text"] = completion["choices"][0]["text"].strip()
-            return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
         # If tool_choice/function_call is provided
-        elif isinstance(function_call, dict):
+        if isinstance(function_call, dict):
             prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
             function_call = function_call["name"]
             function_calls.append(function_call)
             grammar = get_grammar(function_call)
             stops = [STOP_TOKEN, FROM_TOKEN]
             completion = create_completion(stop=stops)
             completion_text = completion["choices"][0]["text"]
+            completion_tokens += completion["usage"]["completion_tokens"]
             function_bodies.append(completion_text.strip())
         # If "auto" or no tool_choice/function_call
         elif isinstance(function_call, str) and function_call == "auto":
@@ -2067,6 +2065,7 @@ def create_completion(stop):
             stops = CONTENT_TOKEN
             completion = create_completion(stop=stops)
             completion_text = completion["choices"][0]["text"]
+            completion_tokens += completion["usage"]["completion_tokens"]
             function_name = completion_text.strip()
             if function_name == "all":
                 prompt += "all\n<|content|>"
@@ -2079,12 +2078,23 @@ def create_completion(stop):
             stops = [RECIPIENT_TOKEN, STOP_TOKEN]
             completion = create_completion(stop=stops)
             completion_text = completion["choices"][0]["text"]
+            completion_tokens += completion["usage"]["completion_tokens"]
             if function_name == "all":
-                content += completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n")
+                if completion_text.endswith("\n<|from|>assistant\n"):
+                    content += completion_text[:-len("\n<|from|>assistant\n")]
+                if completion_text.endswith("\n<|from|> assistant\n"):
+                    content += completion_text[-len("\n<|from|> assistant\n")]
+                else:
+                    content += completion_text
                 content = content.lstrip()
                 # Check whether the model wants to generate another turn
                 if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
-                    cleaned_completion_text = completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n").strip()
+                    if completion_text.endswith("\n<|from|>assistant\n"):
+                        cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip()
+                    elif completion_text.endswith("\n<|from|> assistant\n"):
+                        cleaned_completion_text = completion_text[-len("\n<|from|> assistant\n")].strip()
+                    else:
+                        cleaned_completion_text = completion_text.strip()
                     prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
                 else:
                     break
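The hunk above swaps `str.removesuffix`, which requires Python 3.9+, for explicit `endswith` checks plus slicing so the handler still runs on Python 3.8. The equivalent pattern in isolation, as a small sketch; the suffix value is just one of the Functionary markers used above:

```python
def strip_suffix(text: str, suffix: str) -> str:
    """Python 3.8-compatible stand-in for text.removesuffix(suffix)."""
    if suffix and text.endswith(suffix):
        return text[: -len(suffix)]
    return text


cleaned = strip_suffix("Hello!\n<|from|>assistant\n", "\n<|from|>assistant\n").strip()
assert cleaned == "Hello!"
```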
@@ -2094,6 +2104,7 @@ def create_completion(stop):
                 prompt += completion_text.strip()
                 grammar = None
                 completion = create_completion(stop=stops)
+                completion_tokens += completion["usage"]["completion_tokens"]
                 if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
                     prompt += "\n<|from|>assistant\n<|recipient|>"
                 else:
@@ -2122,12 +2133,16 @@ def create_completion(stop):
         )

     # TODO: support stream mode
-    function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {
-        "function_call": {
-            "name": tool_calls[0]["function"]["name"],
-            "arguments": tool_calls[0]["function"]["arguments"],
-        }
-    } if len(tool_calls) == 1 else {}
+    function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {}
+    if len(tool_calls) > 0:
+        if tools is not None:
+            function_call_dict["tool_calls"] = tool_calls
+        else:
+            function_call_dict["function_call"] = {
+                "name": tool_calls[0]["function"]["name"],
+                "arguments": tool_calls[0]["function"]["arguments"],
+            }
+    completion["usage"]["completion_tokens"] = completion_tokens
     return llama_types.CreateChatCompletionResponse(
         id="chat" + completion["id"],
         object="chat.completion",
@@ -2140,7 +2155,6 @@ def create_completion(stop):
             "message": {
                 "role": "assistant",
                 "content": None if content == "" else content,
-                "tool_calls": tool_calls,
                 **function_call_dict,
             },
             "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop",
