From aea0ea69cd3e635b4b9277b885d78d5a3bf3420d Mon Sep 17 00:00:00 2001 From: ljluestc Date: Sun, 1 Mar 2026 10:36:58 -0800 Subject: [PATCH 1/2] feat: Add DeepSeek R1 and distilled model support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update llama.cpp submodule to latest (b8184) for full DeepSeek R1/V2/V3 architecture support - Add 'deepseek-r1' chat format with correct special tokens (<|User|>, <|Assistant|>, <|begin▁of▁sentence|>, <|end▁of▁sentence|>) - Add 'deepseek-r1-distill-qwen' and 'deepseek-r1-distill-llama' chat format aliases for distilled model variants - Add DEEPSEEK_R1_CHAT_TEMPLATE constant from official HuggingFace tokenizer config - Update guess_chat_format_from_gguf_metadata() to auto-detect DeepSeek R1 models via template matching and heuristic token detection - Handle reasoning content stripping for multi-turn conversations - Bump version to 0.3.17 Closes #1952 --- llama_cpp/__init__.py | 2 +- llama_cpp/llama_chat_format.py | 63 ++++++++++++++++++++++++++++++++++ vendor/llama.cpp | 2 +- 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index c1dde7046..a7c40478b 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.16" +__version__ = "0.3.17" diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f738ab9bb..eb47837a2 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -55,6 +55,11 @@ # Source: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer_config.json LLAMA3_INSTRUCT_CHAT_TEMPLATE = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ 
content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" +# Source: https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/tokenizer_config.json +DEEPSEEK_R1_CHAT_TEMPLATE = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{\"<\uff5cUser\uff5c>\" + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<\uff5ctool\u2581outputs\u2581end\uff5c>' + message['content'] + '<\uff5cend\u2581of\u2581sentence\uff5c>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<\uff5cAssistant\uff5c>' + content + '<\uff5cend\u2581of\u2581sentence\uff5c>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<\uff5ctool\u2581outputs\u2581end\uff5c>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<\uff5cAssistant\uff5c>'}}{% endif %}" +DEEPSEEK_R1_BOS_TOKEN = "<\uff5cbegin\u2581of\u2581sentence\uff5c>" +DEEPSEEK_R1_EOS_TOKEN = "<\uff5cend\u2581of\u2581sentence\uff5c>" + ### Chat Completion Handler ### @@ -807,6 +812,14 @@ def guess_chat_format_from_gguf_metadata(metadata: Dict[str, str]) -> Optional[s if metadata["tokenizer.chat_template"] == LLAMA3_INSTRUCT_CHAT_TEMPLATE: return "llama-3" + if
metadata["tokenizer.chat_template"] == DEEPSEEK_R1_CHAT_TEMPLATE: + return "deepseek-r1" + + # Heuristic: detect DeepSeek R1 models by checking for characteristic tokens + chat_template = metadata["tokenizer.chat_template"] + if "<\uff5cUser\uff5c>" in chat_template and "<\uff5cAssistant\uff5c>" in chat_template: + return "deepseek-r1" + return None @@ -1395,6 +1408,56 @@ def format_gemma( return ChatFormatterResponse(prompt=_prompt, stop=_sep) +# Chat format for DeepSeek R1 and distilled models, see more details: +# https://huggingface.co/deepseek-ai/DeepSeek-R1 +@register_chat_format("deepseek-r1") +def format_deepseek_r1( + messages: List[llama_types.ChatCompletionRequestMessage], + **kwargs: Any, +) -> ChatFormatterResponse: + _bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>" + _eos = "<\uff5cend\u2581of\u2581sentence\uff5c>" + _role_user = "<\uff5cUser\uff5c>" + _role_assistant = "<\uff5cAssistant\uff5c>" + + system_message = _get_system_message(messages) + _prompt = _bos + system_message + + for message in messages: + role = message["role"] + content = message.get("content", "") or "" + if role == "user" and isinstance(content, str): + _prompt += _role_user + content + elif role == "assistant" and isinstance(content, str): + # Strip <think>...</think> thinking content for multi-turn context + if "</think>" in content: + content = content.split("</think>")[-1] + _prompt += _role_assistant + content + _eos + + _prompt += _role_assistant + return ChatFormatterResponse(prompt=_prompt, stop=_eos) + + +# Chat format for DeepSeek R1 distilled models (Qwen-based) +# Uses the same template as DeepSeek R1 +@register_chat_format("deepseek-r1-distill-qwen") +def format_deepseek_r1_distill_qwen( + messages: List[llama_types.ChatCompletionRequestMessage], + **kwargs: Any, +) -> ChatFormatterResponse: + return format_deepseek_r1(messages, **kwargs) + + +# Chat format for DeepSeek R1 distilled models (Llama-based) +# Uses the same template as DeepSeek R1 +@register_chat_format("deepseek-r1-distill-llama")
+def format_deepseek_r1_distill_llama( + messages: List[llama_types.ChatCompletionRequestMessage], + **kwargs: Any, +) -> ChatFormatterResponse: + return format_deepseek_r1(messages, **kwargs) + + # Tricky chat formats that require custom chat handlers diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4227c9be4..319146247 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4227c9be4268ac844921b90f31595f81236bd317 +Subproject commit 319146247e643695f94a558e8ae686277dd4f8da From 8764ab502446db8c21e62c9f02cf5e1459ad16f7 Mon Sep 17 00:00:00 2001 From: ljluestc Date: Sun, 1 Mar 2026 12:22:34 -0800 Subject: [PATCH 2/2] fix: set added_special=True in DeepSeek R1 format to prevent double BOS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The format_deepseek_r1 function already includes the BOS token (<|begin▁of▁sentence|>) in the formatted prompt, but was not setting added_special=True in the ChatFormatterResponse. This caused chat_formatter_to_chat_completion_handler to pass add_bos=True to the tokenizer, resulting in a duplicate BOS token. 
Also adds comprehensive tests for: - Single-turn and multi-turn conversations - System message handling - reasoning content stripping - Distilled model aliases (qwen/llama) - Auto-detection via exact match and heuristic --- llama_cpp/llama_chat_format.py | 2 +- tests/test_llama_chat_format.py | 156 +++++++++++++++++++++++++++++++- 2 files changed, 153 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index eb47837a2..5c5785973 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1435,7 +1435,7 @@ def format_deepseek_r1( _prompt += _role_assistant + content + _eos _prompt += _role_assistant - return ChatFormatterResponse(prompt=_prompt, stop=_eos) + return ChatFormatterResponse(prompt=_prompt, stop=_eos, added_special=True) # Chat format for DeepSeek R1 distilled models (Qwen-based) diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py index f031bf72b..8f6b0fe3c 100644 --- a/tests/test_llama_chat_format.py +++ b/tests/test_llama_chat_format.py @@ -1,14 +1,34 @@ import json +import sys +import logging +import ctypes +from unittest.mock import MagicMock import jinja2 -from llama_cpp import ( - ChatCompletionRequestUserMessage, -) +# Stub the native C library and dependent modules so tests can run +# without compiling llama.cpp +_mock_llama_cpp = MagicMock() +_mock_llama_cpp.llama_log_callback = lambda f: f # decorator passthrough +_mock_llama_cpp.llama_log_set = MagicMock() +sys.modules.setdefault("llama_cpp.llama_cpp", _mock_llama_cpp) + +_mock_llama = MagicMock() +_mock_llama.StoppingCriteriaList = list +_mock_llama.LogitsProcessorList = list +_mock_llama.LlamaGrammar = MagicMock +sys.modules.setdefault("llama_cpp.llama", _mock_llama) + import llama_cpp.llama_types as llama_types import llama_cpp.llama_chat_format as llama_chat_format -from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter +from llama_cpp.llama_chat_format import ( 
+ hf_tokenizer_config_to_chat_formatter, + guess_chat_format_from_gguf_metadata, + DEEPSEEK_R1_CHAT_TEMPLATE, +) + +ChatCompletionRequestUserMessage = llama_types.ChatCompletionRequestUserMessage def test_mistral_instruct(): chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" @@ -87,3 +107,131 @@ def test_hf_tokenizer_config_str_to_chat_formatter(): ) assert chat_formatter_respoonse.prompt == ("[INST] Hello, world! [/INST]" "") + + +def test_deepseek_r1_single_turn(): + """Test DeepSeek R1 format with a single user message.""" + messages = [ + llama_types.ChatCompletionRequestUserMessage(role="user", content="Hello"), + ] + response = llama_chat_format.format_deepseek_r1(messages=messages) + + bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>" + eos = "<\uff5cend\u2581of\u2581sentence\uff5c>" + user_tag = "<\uff5cUser\uff5c>" + assistant_tag = "<\uff5cAssistant\uff5c>" + + expected = f"{bos}{user_tag}Hello{assistant_tag}" + assert response.prompt == expected + assert response.stop == eos + assert response.added_special is True + + +def test_deepseek_r1_with_system_message(): + """Test DeepSeek R1 format with a system message.""" + messages = [ + llama_types.ChatCompletionRequestSystemMessage(role="system", content="You are a helpful assistant."), + llama_types.ChatCompletionRequestUserMessage(role="user", content="Hi"), + ] + response = llama_chat_format.format_deepseek_r1(messages=messages) + + bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>" + eos = "<\uff5cend\u2581of\u2581sentence\uff5c>" + user_tag = "<\uff5cUser\uff5c>" + 
assistant_tag = "<\uff5cAssistant\uff5c>" + + expected = f"{bos}You are a helpful assistant.{user_tag}Hi{assistant_tag}" + assert response.prompt == expected + + +def test_deepseek_r1_multi_turn(): + """Test DeepSeek R1 format with multi-turn conversation.""" + messages = [ + llama_types.ChatCompletionRequestUserMessage(role="user", content="What is 2+2?"), + llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content="4"), + llama_types.ChatCompletionRequestUserMessage(role="user", content="And 3+3?"), + ] + response = llama_chat_format.format_deepseek_r1(messages=messages) + + bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>" + eos = "<\uff5cend\u2581of\u2581sentence\uff5c>" + user_tag = "<\uff5cUser\uff5c>" + assistant_tag = "<\uff5cAssistant\uff5c>" + + expected = ( + f"{bos}" + f"{user_tag}What is 2+2?" + f"{assistant_tag}4{eos}" + f"{user_tag}And 3+3?" + f"{assistant_tag}" + ) + assert response.prompt == expected + + +def test_deepseek_r1_think_stripping(): + """Test that <think> reasoning content is stripped from assistant messages in multi-turn.""" + messages = [ + llama_types.ChatCompletionRequestUserMessage(role="user", content="Solve x+1=3"), + llama_types.ChatCompletionRequestAssistantMessage( + role="assistant", + content="<think>Let me solve this step by step. x+1=3, so x=2.</think>x = 2", + ), + llama_types.ChatCompletionRequestUserMessage(role="user", content="Are you sure?"), + ] + response = llama_chat_format.format_deepseek_r1(messages=messages) + + bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>" + eos = "<\uff5cend\u2581of\u2581sentence\uff5c>" + user_tag = "<\uff5cUser\uff5c>" + assistant_tag = "<\uff5cAssistant\uff5c>" + + # The thinking content should be stripped, only "x = 2" remains + expected = ( + f"{bos}" + f"{user_tag}Solve x+1=3" + f"{assistant_tag}x = 2{eos}" + f"{user_tag}Are you sure?"
+ f"{assistant_tag}" + ) + assert response.prompt == expected + + +def test_deepseek_r1_distill_aliases(): + """Test that distilled model aliases produce the same output as the base format.""" + messages = [ + llama_types.ChatCompletionRequestUserMessage(role="user", content="Hello"), + ] + base = llama_chat_format.format_deepseek_r1(messages=messages) + qwen = llama_chat_format.format_deepseek_r1_distill_qwen(messages=messages) + llama_variant = llama_chat_format.format_deepseek_r1_distill_llama(messages=messages) + + assert base.prompt == qwen.prompt + assert base.prompt == llama_variant.prompt + assert base.stop == qwen.stop == llama_variant.stop + assert base.added_special == qwen.added_special == llama_variant.added_special + + +def test_guess_chat_format_deepseek_r1_exact_match(): + """Test auto-detection via exact template match.""" + metadata = {"tokenizer.chat_template": DEEPSEEK_R1_CHAT_TEMPLATE} + assert guess_chat_format_from_gguf_metadata(metadata) == "deepseek-r1" + + +def test_guess_chat_format_deepseek_r1_heuristic(): + """Test auto-detection via heuristic token presence.""" + # A template that contains the DeepSeek tokens but isn't an exact match + fake_template = "some preamble <\uff5cUser\uff5c> stuff <\uff5cAssistant\uff5c> more stuff" + metadata = {"tokenizer.chat_template": fake_template} + assert guess_chat_format_from_gguf_metadata(metadata) == "deepseek-r1" + + +def test_guess_chat_format_no_match(): + """Test that unrecognized templates return None.""" + metadata = {"tokenizer.chat_template": "some unknown template"} + assert guess_chat_format_from_gguf_metadata(metadata) is None + + +def test_guess_chat_format_no_template(): + """Test that missing chat_template returns None.""" + metadata = {} + assert guess_chat_format_from_gguf_metadata(metadata) is None