Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llama_cpp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

__version__ = "0.3.16"
__version__ = "0.3.17"
63 changes: 63 additions & 0 deletions llama_cpp/llama_chat_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@
# Source: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer_config.json
LLAMA3_INSTRUCT_CHAT_TEMPLATE = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

# Source: https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/tokenizer_config.json
DEEPSEEK_R1_CHAT_TEMPLATE = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{\"<\uff5cUser\uff5c>\" + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<\uff5ctool\u2581outputs\u2581end\uff5c>' + message['content'] + '<\uff5cend\u2581of\u2581sentence\uff5c>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<\uff5cAssistant\uff5c>' + content + '<\uff5cend\u2581of\u2581sentence\uff5c>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<\uff5ctool\u2581outputs\u2581end\uff5c>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<\uff5cAssistant\uff5c>'}}{% endif %}"
# Special tokens of the DeepSeek R1 tokenizer.  Note these are NOT plain
# ASCII: the bars are FULLWIDTH VERTICAL LINE (U+FF5C) and the separators
# are LOWER ONE EIGHTH BLOCK (U+2581), exactly as emitted by the tokenizer.
DEEPSEEK_R1_BOS_TOKEN = "<\uff5cbegin\u2581of\u2581sentence\uff5c>"
DEEPSEEK_R1_EOS_TOKEN = "<\uff5cend\u2581of\u2581sentence\uff5c>"

### Chat Completion Handler ###


Expand Down Expand Up @@ -807,6 +812,14 @@ def guess_chat_format_from_gguf_metadata(metadata: Dict[str, str]) -> Optional[s
if metadata["tokenizer.chat_template"] == LLAMA3_INSTRUCT_CHAT_TEMPLATE:
return "llama-3"

if metadata["tokenizer.chat_template"] == DEEPSEEK_R1_CHAT_TEMPLATE:
return "deepseek-r1"

# Heuristic: detect DeepSeek R1 models by checking for characteristic tokens
chat_template = metadata["tokenizer.chat_template"]
if "<\uff5cUser\uff5c>" in chat_template and "<\uff5cAssistant\uff5c>" in chat_template:
return "deepseek-r1"

return None


Expand Down Expand Up @@ -1395,6 +1408,56 @@ def format_gemma(
return ChatFormatterResponse(prompt=_prompt, stop=_sep)


# Chat format for DeepSeek R1 and distilled models, see more details:
# https://huggingface.co/deepseek-ai/DeepSeek-R1
@register_chat_format("deepseek-r1")
def format_deepseek_r1(
    messages: List[llama_types.ChatCompletionRequestMessage],
    **kwargs: Any,
) -> ChatFormatterResponse:
    """Render *messages* as a DeepSeek R1 prompt.

    Layout: BOS token, optional system text, then alternating
    <|User|>/<|Assistant|> turns (each assistant turn terminated by EOS),
    and a trailing <|Assistant|> tag so generation continues as the
    assistant.  Chain-of-thought wrapped in ``<think>...</think>`` is
    dropped from assistant turns, mirroring the official chat template.
    """
    bos_token = "<\uff5cbegin\u2581of\u2581sentence\uff5c>"
    eos_token = "<\uff5cend\u2581of\u2581sentence\uff5c>"
    user_tag = "<\uff5cUser\uff5c>"
    assistant_tag = "<\uff5cAssistant\uff5c>"

    parts = [bos_token, _get_system_message(messages)]

    for message in messages:
        text = message.get("content", "") or ""
        if not isinstance(text, str):
            continue  # skip multi-part (list) content; only plain strings are formatted
        role = message["role"]
        if role == "user":
            parts.append(user_tag + text)
        elif role == "assistant":
            # Keep only the final answer; discard reasoning before </think>.
            answer = text.split("</think>")[-1] if "</think>" in text else text
            parts.append(assistant_tag + answer + eos_token)

    parts.append(assistant_tag)
    return ChatFormatterResponse(prompt="".join(parts), stop=eos_token, added_special=True)


# Chat format for DeepSeek R1 distilled models (Qwen-based)
# Uses the same template as DeepSeek R1
@register_chat_format("deepseek-r1-distill-qwen")
def format_deepseek_r1_distill_qwen(
    messages: List[llama_types.ChatCompletionRequestMessage],
    **kwargs: Any,
) -> ChatFormatterResponse:
    """Alias for :func:`format_deepseek_r1`; Qwen distills share R1's template."""
    return format_deepseek_r1(messages=messages, **kwargs)


# Chat format for DeepSeek R1 distilled models (Llama-based)
# Uses the same template as DeepSeek R1
@register_chat_format("deepseek-r1-distill-llama")
def format_deepseek_r1_distill_llama(
    messages: List[llama_types.ChatCompletionRequestMessage],
    **kwargs: Any,
) -> ChatFormatterResponse:
    """Alias for :func:`format_deepseek_r1`; Llama distills share R1's template."""
    return format_deepseek_r1(messages=messages, **kwargs)


# Tricky chat formats that require custom chat handlers


Expand Down
156 changes: 152 additions & 4 deletions tests/test_llama_chat_format.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,34 @@
import json
import sys
import logging
import ctypes
from unittest.mock import MagicMock

import jinja2

from llama_cpp import (
ChatCompletionRequestUserMessage,
)
# Stub the native C library and dependent modules so tests can run
# without compiling llama.cpp.  These stubs must be installed BEFORE any
# llama_cpp submodule is imported below; sys.modules.setdefault leaves a
# real module in place if one was already importable.
_mock_llama_cpp = MagicMock()
_mock_llama_cpp.llama_log_callback = lambda f: f # decorator passthrough so decorated functions stay callable
_mock_llama_cpp.llama_log_set = MagicMock()
sys.modules.setdefault("llama_cpp.llama_cpp", _mock_llama_cpp)

# llama_cpp.llama would pull in the native bindings; stub it, backing the
# container types with `list` so iteration/append in the formatters still work.
_mock_llama = MagicMock()
_mock_llama.StoppingCriteriaList = list
_mock_llama.LogitsProcessorList = list
_mock_llama.LlamaGrammar = MagicMock
sys.modules.setdefault("llama_cpp.llama", _mock_llama)

import llama_cpp.llama_types as llama_types
import llama_cpp.llama_chat_format as llama_chat_format

from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter
from llama_cpp.llama_chat_format import (
hf_tokenizer_config_to_chat_formatter,
guess_chat_format_from_gguf_metadata,
DEEPSEEK_R1_CHAT_TEMPLATE,
)

ChatCompletionRequestUserMessage = llama_types.ChatCompletionRequestUserMessage

def test_mistral_instruct():
chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
Expand Down Expand Up @@ -87,3 +107,131 @@ def test_hf_tokenizer_config_str_to_chat_formatter():
)

assert chat_formatter_respoonse.prompt == ("<s>[INST] Hello, world! [/INST]</s>" "")


def test_deepseek_r1_single_turn():
    """A lone user message yields BOS + user tag + content + assistant tag."""
    bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>"
    eos = "<\uff5cend\u2581of\u2581sentence\uff5c>"
    user_tag = "<\uff5cUser\uff5c>"
    assistant_tag = "<\uff5cAssistant\uff5c>"

    result = llama_chat_format.format_deepseek_r1(
        messages=[
            llama_types.ChatCompletionRequestUserMessage(role="user", content="Hello"),
        ]
    )

    assert result.prompt == bos + user_tag + "Hello" + assistant_tag
    assert result.stop == eos
    assert result.added_special is True


def test_deepseek_r1_with_system_message():
    """Test DeepSeek R1 format with a system message.

    The system text must appear immediately after BOS and before the first
    user turn; the stop token is still the EOS token.
    """
    messages = [
        llama_types.ChatCompletionRequestSystemMessage(role="system", content="You are a helpful assistant."),
        llama_types.ChatCompletionRequestUserMessage(role="user", content="Hi"),
    ]
    response = llama_chat_format.format_deepseek_r1(messages=messages)

    bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>"
    eos = "<\uff5cend\u2581of\u2581sentence\uff5c>"
    user_tag = "<\uff5cUser\uff5c>"
    assistant_tag = "<\uff5cAssistant\uff5c>"

    expected = f"{bos}You are a helpful assistant.{user_tag}Hi{assistant_tag}"
    assert response.prompt == expected
    # `eos` was previously unused; also pin the stop token here.
    assert response.stop == eos


def test_deepseek_r1_multi_turn():
    """Turns alternate user/assistant tags; each assistant answer ends with EOS."""
    bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>"
    eos = "<\uff5cend\u2581of\u2581sentence\uff5c>"
    user_tag = "<\uff5cUser\uff5c>"
    assistant_tag = "<\uff5cAssistant\uff5c>"

    conversation = [
        llama_types.ChatCompletionRequestUserMessage(role="user", content="What is 2+2?"),
        llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content="4"),
        llama_types.ChatCompletionRequestUserMessage(role="user", content="And 3+3?"),
    ]
    result = llama_chat_format.format_deepseek_r1(messages=conversation)

    assert result.prompt == (
        bos
        + user_tag + "What is 2+2?"
        + assistant_tag + "4" + eos
        + user_tag + "And 3+3?"
        + assistant_tag
    )


def test_deepseek_r1_think_stripping():
    """Reasoning before ``</think>`` is removed from prior assistant turns."""
    bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>"
    eos = "<\uff5cend\u2581of\u2581sentence\uff5c>"
    user_tag = "<\uff5cUser\uff5c>"
    assistant_tag = "<\uff5cAssistant\uff5c>"

    conversation = [
        llama_types.ChatCompletionRequestUserMessage(role="user", content="Solve x+1=3"),
        llama_types.ChatCompletionRequestAssistantMessage(
            role="assistant",
            content="<think>Let me solve this step by step. x+1=3, so x=2.</think>x = 2",
        ),
        llama_types.ChatCompletionRequestUserMessage(role="user", content="Are you sure?"),
    ]
    result = llama_chat_format.format_deepseek_r1(messages=conversation)

    # Only "x = 2" survives; the <think>...</think> reasoning is dropped.
    assert result.prompt == (
        bos
        + user_tag + "Solve x+1=3"
        + assistant_tag + "x = 2" + eos
        + user_tag + "Are you sure?"
        + assistant_tag
    )


def test_deepseek_r1_distill_aliases():
    """The Qwen- and Llama-distill formats are pure aliases of the base R1 format."""
    msgs = [
        llama_types.ChatCompletionRequestUserMessage(role="user", content="Hello"),
    ]
    reference = llama_chat_format.format_deepseek_r1(messages=msgs)
    variants = [
        llama_chat_format.format_deepseek_r1_distill_qwen(messages=msgs),
        llama_chat_format.format_deepseek_r1_distill_llama(messages=msgs),
    ]

    for variant in variants:
        assert variant.prompt == reference.prompt
        assert variant.stop == reference.stop
        assert variant.added_special == reference.added_special


def test_guess_chat_format_deepseek_r1_exact_match():
    """A byte-for-byte template match is detected as deepseek-r1."""
    guessed = guess_chat_format_from_gguf_metadata(
        {"tokenizer.chat_template": DEEPSEEK_R1_CHAT_TEMPLATE}
    )
    assert guessed == "deepseek-r1"


def test_guess_chat_format_deepseek_r1_heuristic():
    """Templates merely containing both R1 role tokens are still detected."""
    # Not an exact template match — exercises the token-presence heuristic.
    template = "some preamble <\uff5cUser\uff5c> stuff <\uff5cAssistant\uff5c> more stuff"
    guessed = guess_chat_format_from_gguf_metadata({"tokenizer.chat_template": template})
    assert guessed == "deepseek-r1"


def test_guess_chat_format_no_match():
    """An unrecognized template maps to no chat format."""
    guessed = guess_chat_format_from_gguf_metadata(
        {"tokenizer.chat_template": "some unknown template"}
    )
    assert guessed is None


def test_guess_chat_format_no_template():
    """Metadata lacking a chat template maps to no chat format."""
    assert guess_chat_format_from_gguf_metadata({}) is None
2 changes: 1 addition & 1 deletion vendor/llama.cpp