From aea0ea69cd3e635b4b9277b885d78d5a3bf3420d Mon Sep 17 00:00:00 2001 From: ljluestc Date: Sun, 1 Mar 2026 10:36:58 -0800 Subject: [PATCH 1/2] feat: Add DeepSeek R1 and distilled model support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update llama.cpp submodule to latest (b8184) for full DeepSeek R1/V2/V3 architecture support - Add 'deepseek-r1' chat format with correct special tokens (<|User|>, <|Assistant|>, <|begin▁of▁sentence|>, <|end▁of▁sentence|>) - Add 'deepseek-r1-distill-qwen' and 'deepseek-r1-distill-llama' chat format aliases for distilled model variants - Add DEEPSEEK_R1_CHAT_TEMPLATE constant from official HuggingFace tokenizer config - Update guess_chat_format_from_gguf_metadata() to auto-detect DeepSeek R1 models via template matching and heuristic token detection - Handle reasoning content stripping for multi-turn conversations - Bump version to 0.3.17 Closes #1952 --- llama_cpp/__init__.py | 2 +- llama_cpp/llama_chat_format.py | 63 ++++++++++++++++++++++++++++++++++ vendor/llama.cpp | 2 +- 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index c1dde7046..a7c40478b 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.16" +__version__ = "0.3.17" diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index f738ab9bb..eb47837a2 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -55,6 +55,11 @@ # Source: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer_config.json LLAMA3_INSTRUCT_CHAT_TEMPLATE = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ 
content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" +# Source: https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/tokenizer_config.json +DEEPSEEK_R1_CHAT_TEMPLATE = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{\"<\uff5cUser\uff5c>\" + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<\uff5ctool\u2581outputs\u2581end\uff5c>' + message['content'] + '<\uff5cend\u2581of\u2581sentence\uff5c>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<\uff5cAssistant\uff5c>' + content + '<\uff5cend\u2581of\u2581sentence\uff5c>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<\uff5ctool\u2581outputs\u2581end\uff5c>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<\uff5cAssistant\uff5c>'}}{% endif %}" +DEEPSEEK_R1_BOS_TOKEN = "<\uff5cbegin\u2581of\u2581sentence\uff5c>" +DEEPSEEK_R1_EOS_TOKEN = "<\uff5cend\u2581of\u2581sentence\uff5c>" + ### Chat Completion Handler ### @@ -807,6 +812,14 @@ def guess_chat_format_from_gguf_metadata(metadata: Dict[str, str]) -> Optional[s if metadata["tokenizer.chat_template"] == LLAMA3_INSTRUCT_CHAT_TEMPLATE: return "llama-3" + if
metadata["tokenizer.chat_template"] == DEEPSEEK_R1_CHAT_TEMPLATE: + return "deepseek-r1" + + # Heuristic: detect DeepSeek R1 models by checking for characteristic tokens + chat_template = metadata["tokenizer.chat_template"] + if "<\uff5cUser\uff5c>" in chat_template and "<\uff5cAssistant\uff5c>" in chat_template: + return "deepseek-r1" + return None @@ -1395,6 +1408,56 @@ def format_gemma( return ChatFormatterResponse(prompt=_prompt, stop=_sep) +# Chat format for DeepSeek R1 and distilled models, see more details: +# https://huggingface.co/deepseek-ai/DeepSeek-R1 +@register_chat_format("deepseek-r1") +def format_deepseek_r1( + messages: List[llama_types.ChatCompletionRequestMessage], + **kwargs: Any, +) -> ChatFormatterResponse: + _bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>" + _eos = "<\uff5cend\u2581of\u2581sentence\uff5c>" + _role_user = "<\uff5cUser\uff5c>" + _role_assistant = "<\uff5cAssistant\uff5c>" + + system_message = _get_system_message(messages) + _prompt = _bos + system_message + + for message in messages: + role = message["role"] + content = message.get("content", "") or "" + if role == "user" and isinstance(content, str): + _prompt += _role_user + content + elif role == "assistant" and isinstance(content, str): + # Strip <think>...</think> thinking content for multi-turn context + if "</think>" in content: + content = content.split("</think>")[-1] + _prompt += _role_assistant + content + _eos + + _prompt += _role_assistant + return ChatFormatterResponse(prompt=_prompt, stop=_eos) + + +# Chat format for DeepSeek R1 distilled models (Qwen-based) +# Uses the same template as DeepSeek R1 +@register_chat_format("deepseek-r1-distill-qwen") +def format_deepseek_r1_distill_qwen( + messages: List[llama_types.ChatCompletionRequestMessage], + **kwargs: Any, +) -> ChatFormatterResponse: + return format_deepseek_r1(messages, **kwargs) + + +# Chat format for DeepSeek R1 distilled models (Llama-based) +# Uses the same template as DeepSeek R1 +@register_chat_format("deepseek-r1-distill-llama")
+def format_deepseek_r1_distill_llama( + messages: List[llama_types.ChatCompletionRequestMessage], + **kwargs: Any, +) -> ChatFormatterResponse: + return format_deepseek_r1(messages, **kwargs) + + # Tricky chat formats that require custom chat handlers diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4227c9be4..319146247 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4227c9be4268ac844921b90f31595f81236bd317 +Subproject commit 319146247e643695f94a558e8ae686277dd4f8da From 8764ab502446db8c21e62c9f02cf5e1459ad16f7 Mon Sep 17 00:00:00 2001 From: ljluestc Date: Sun, 1 Mar 2026 12:22:34 -0800 Subject: [PATCH 2/2] fix: set added_special=True in DeepSeek R1 format to prevent double BOS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The format_deepseek_r1 function already includes the BOS token (<|begin▁of▁sentence|>) in the formatted prompt, but was not setting added_special=True in the ChatFormatterResponse. This caused chat_formatter_to_chat_completion_handler to pass add_bos=True to the tokenizer, resulting in a duplicate BOS token. 
Also adds comprehensive tests for: - Single-turn and multi-turn conversations - System message handling - reasoning content stripping - Distilled model aliases (qwen/llama) - Auto-detection via exact match and heuristic --- llama_cpp/llama_chat_format.py | 2 +- tests/test_llama_chat_format.py | 156 +++++++++++++++++++++++++++++++- 2 files changed, 153 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index eb47837a2..5c5785973 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -1435,7 +1435,7 @@ def format_deepseek_r1( _prompt += _role_assistant + content + _eos _prompt += _role_assistant - return ChatFormatterResponse(prompt=_prompt, stop=_eos) + return ChatFormatterResponse(prompt=_prompt, stop=_eos, added_special=True) # Chat format for DeepSeek R1 distilled models (Qwen-based) diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py index f031bf72b..8f6b0fe3c 100644 --- a/tests/test_llama_chat_format.py +++ b/tests/test_llama_chat_format.py @@ -1,14 +1,34 @@ import json +import sys +import logging +import ctypes +from unittest.mock import MagicMock import jinja2 -from llama_cpp import ( - ChatCompletionRequestUserMessage, -) +# Stub the native C library and dependent modules so tests can run +# without compiling llama.cpp +_mock_llama_cpp = MagicMock() +_mock_llama_cpp.llama_log_callback = lambda f: f # decorator passthrough +_mock_llama_cpp.llama_log_set = MagicMock() +sys.modules.setdefault("llama_cpp.llama_cpp", _mock_llama_cpp) + +_mock_llama = MagicMock() +_mock_llama.StoppingCriteriaList = list +_mock_llama.LogitsProcessorList = list +_mock_llama.LlamaGrammar = MagicMock +sys.modules.setdefault("llama_cpp.llama", _mock_llama) + import llama_cpp.llama_types as llama_types import llama_cpp.llama_chat_format as llama_chat_format -from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter +from llama_cpp.llama_chat_format import ( 
+ hf_tokenizer_config_to_chat_formatter, + guess_chat_format_from_gguf_metadata, + DEEPSEEK_R1_CHAT_TEMPLATE, +) + +ChatCompletionRequestUserMessage = llama_types.ChatCompletionRequestUserMessage def test_mistral_instruct(): chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" @@ -87,3 +107,131 @@ def test_hf_tokenizer_config_str_to_chat_formatter(): ) assert chat_formatter_respoonse.prompt == ("[INST] Hello, world! [/INST]" "") + + +def test_deepseek_r1_single_turn(): + """Test DeepSeek R1 format with a single user message.""" + messages = [ + llama_types.ChatCompletionRequestUserMessage(role="user", content="Hello"), + ] + response = llama_chat_format.format_deepseek_r1(messages=messages) + + bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>" + eos = "<\uff5cend\u2581of\u2581sentence\uff5c>" + user_tag = "<\uff5cUser\uff5c>" + assistant_tag = "<\uff5cAssistant\uff5c>" + + expected = f"{bos}{user_tag}Hello{assistant_tag}" + assert response.prompt == expected + assert response.stop == eos + assert response.added_special is True + + +def test_deepseek_r1_with_system_message(): + """Test DeepSeek R1 format with a system message.""" + messages = [ + llama_types.ChatCompletionRequestSystemMessage(role="system", content="You are a helpful assistant."), + llama_types.ChatCompletionRequestUserMessage(role="user", content="Hi"), + ] + response = llama_chat_format.format_deepseek_r1(messages=messages) + + bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>" + eos = "<\uff5cend\u2581of\u2581sentence\uff5c>" + user_tag = "<\uff5cUser\uff5c>" + 
assistant_tag = "<\uff5cAssistant\uff5c>" + + expected = f"{bos}You are a helpful assistant.{user_tag}Hi{assistant_tag}" + assert response.prompt == expected + + +def test_deepseek_r1_multi_turn(): + """Test DeepSeek R1 format with multi-turn conversation.""" + messages = [ + llama_types.ChatCompletionRequestUserMessage(role="user", content="What is 2+2?"), + llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content="4"), + llama_types.ChatCompletionRequestUserMessage(role="user", content="And 3+3?"), + ] + response = llama_chat_format.format_deepseek_r1(messages=messages) + + bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>" + eos = "<\uff5cend\u2581of\u2581sentence\uff5c>" + user_tag = "<\uff5cUser\uff5c>" + assistant_tag = "<\uff5cAssistant\uff5c>" + + expected = ( + f"{bos}" + f"{user_tag}What is 2+2?" + f"{assistant_tag}4{eos}" + f"{user_tag}And 3+3?" + f"{assistant_tag}" + ) + assert response.prompt == expected + + +def test_deepseek_r1_think_stripping(): + """Test that <think> reasoning content is stripped from assistant messages in multi-turn.""" + messages = [ + llama_types.ChatCompletionRequestUserMessage(role="user", content="Solve x+1=3"), + llama_types.ChatCompletionRequestAssistantMessage( + role="assistant", + content="<think>Let me solve this step by step. x+1=3, so x=2.</think>x = 2", + ), + llama_types.ChatCompletionRequestUserMessage(role="user", content="Are you sure?"), + ] + response = llama_chat_format.format_deepseek_r1(messages=messages) + + bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>" + eos = "<\uff5cend\u2581of\u2581sentence\uff5c>" + user_tag = "<\uff5cUser\uff5c>" + assistant_tag = "<\uff5cAssistant\uff5c>" + + # The thinking content should be stripped, only "x = 2" remains + expected = ( + f"{bos}" + f"{user_tag}Solve x+1=3" + f"{assistant_tag}x = 2{eos}" + f"{user_tag}Are you sure?"
+ f"{assistant_tag}" + ) + assert response.prompt == expected + + +def test_deepseek_r1_distill_aliases(): + """Test that distilled model aliases produce the same output as the base format.""" + messages = [ + llama_types.ChatCompletionRequestUserMessage(role="user", content="Hello"), + ] + base = llama_chat_format.format_deepseek_r1(messages=messages) + qwen = llama_chat_format.format_deepseek_r1_distill_qwen(messages=messages) + llama_variant = llama_chat_format.format_deepseek_r1_distill_llama(messages=messages) + + assert base.prompt == qwen.prompt + assert base.prompt == llama_variant.prompt + assert base.stop == qwen.stop == llama_variant.stop + assert base.added_special == qwen.added_special == llama_variant.added_special + + +def test_guess_chat_format_deepseek_r1_exact_match(): + """Test auto-detection via exact template match.""" + metadata = {"tokenizer.chat_template": DEEPSEEK_R1_CHAT_TEMPLATE} + assert guess_chat_format_from_gguf_metadata(metadata) == "deepseek-r1" + + +def test_guess_chat_format_deepseek_r1_heuristic(): + """Test auto-detection via heuristic token presence.""" + # A template that contains the DeepSeek tokens but isn't an exact match + fake_template = "some preamble <\uff5cUser\uff5c> stuff <\uff5cAssistant\uff5c> more stuff" + metadata = {"tokenizer.chat_template": fake_template} + assert guess_chat_format_from_gguf_metadata(metadata) == "deepseek-r1" + + +def test_guess_chat_format_no_match(): + """Test that unrecognized templates return None.""" + metadata = {"tokenizer.chat_template": "some unknown template"} + assert guess_chat_format_from_gguf_metadata(metadata) is None + + +def test_guess_chat_format_no_template(): + """Test that missing chat_template returns None.""" + metadata = {} + assert guess_chat_format_from_gguf_metadata(metadata) is None