diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index c1dde7046..a7c40478b 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *
-__version__ = "0.3.16"
+__version__ = "0.3.17"
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index f738ab9bb..5c5785973 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -55,6 +55,11 @@
# Source: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer_config.json
LLAMA3_INSTRUCT_CHAT_TEMPLATE = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+# Source: https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/tokenizer_config.json
+DEEPSEEK_R1_CHAT_TEMPLATE = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{\"<\uff5cUser\uff5c>\" + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<\uff5ctool\u2581outputs\u2581end\uff5c>' + message['content'] + '<\uff5cend\u2581of\u2581sentence\uff5c>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<\uff5cAssistant\uff5c>' + content + '<\uff5cend\u2581of\u2581sentence\uff5c>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<\uff5ctool\u2581outputs\u2581end\uff5c>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<\uff5cAssistant\uff5c>'}}{% endif %}"
+DEEPSEEK_R1_BOS_TOKEN = "<\uff5cbegin\u2581of\u2581sentence\uff5c>"
+DEEPSEEK_R1_EOS_TOKEN = "<\uff5cend\u2581of\u2581sentence\uff5c>"
+
### Chat Completion Handler ###
@@ -807,6 +812,14 @@ def guess_chat_format_from_gguf_metadata(metadata: Dict[str, str]) -> Optional[s
if metadata["tokenizer.chat_template"] == LLAMA3_INSTRUCT_CHAT_TEMPLATE:
return "llama-3"
+ if metadata["tokenizer.chat_template"] == DEEPSEEK_R1_CHAT_TEMPLATE:
+ return "deepseek-r1"
+
+ # Heuristic: detect DeepSeek R1 models by checking for characteristic tokens
+ chat_template = metadata["tokenizer.chat_template"]
+ if "<\uff5cUser\uff5c>" in chat_template and "<\uff5cAssistant\uff5c>" in chat_template:
+ return "deepseek-r1"
+
return None
@@ -1395,6 +1408,56 @@ def format_gemma(
return ChatFormatterResponse(prompt=_prompt, stop=_sep)
+# Chat format for DeepSeek R1 and distilled models, see more details:
+# https://huggingface.co/deepseek-ai/DeepSeek-R1
+@register_chat_format("deepseek-r1")
+def format_deepseek_r1(
+ messages: List[llama_types.ChatCompletionRequestMessage],
+ **kwargs: Any,
+) -> ChatFormatterResponse:
+ _bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>"
+ _eos = "<\uff5cend\u2581of\u2581sentence\uff5c>"
+ _role_user = "<\uff5cUser\uff5c>"
+ _role_assistant = "<\uff5cAssistant\uff5c>"
+
+ system_message = _get_system_message(messages)
+ _prompt = _bos + system_message
+
+ for message in messages:
+ role = message["role"]
+ content = message.get("content", "") or ""
+ if role == "user" and isinstance(content, str):
+ _prompt += _role_user + content
+ elif role == "assistant" and isinstance(content, str):
+ # Strip thinking content for multi-turn context
+ if "" in content:
+ content = content.split("")[-1]
+ _prompt += _role_assistant + content + _eos
+
+ _prompt += _role_assistant
+ return ChatFormatterResponse(prompt=_prompt, stop=_eos, added_special=True)
+
+
+# Chat format for DeepSeek R1 distilled models (Qwen-based)
+# Uses the same template as DeepSeek R1
+@register_chat_format("deepseek-r1-distill-qwen")
+def format_deepseek_r1_distill_qwen(
+ messages: List[llama_types.ChatCompletionRequestMessage],
+ **kwargs: Any,
+) -> ChatFormatterResponse:
+ return format_deepseek_r1(messages, **kwargs)
+
+
+# Chat format for DeepSeek R1 distilled models (Llama-based)
+# Uses the same template as DeepSeek R1
+@register_chat_format("deepseek-r1-distill-llama")
+def format_deepseek_r1_distill_llama(
+ messages: List[llama_types.ChatCompletionRequestMessage],
+ **kwargs: Any,
+) -> ChatFormatterResponse:
+ return format_deepseek_r1(messages, **kwargs)
+
+
# Tricky chat formats that require custom chat handlers
diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py
index f031bf72b..8f6b0fe3c 100644
--- a/tests/test_llama_chat_format.py
+++ b/tests/test_llama_chat_format.py
@@ -1,14 +1,34 @@
import json
+import sys
+import logging
+import ctypes
+from unittest.mock import MagicMock
import jinja2
-from llama_cpp import (
- ChatCompletionRequestUserMessage,
-)
+# Stub the native C library and dependent modules so tests can run
+# without compiling llama.cpp
+_mock_llama_cpp = MagicMock()
+_mock_llama_cpp.llama_log_callback = lambda f: f # decorator passthrough
+_mock_llama_cpp.llama_log_set = MagicMock()
+sys.modules.setdefault("llama_cpp.llama_cpp", _mock_llama_cpp)
+
+_mock_llama = MagicMock()
+_mock_llama.StoppingCriteriaList = list
+_mock_llama.LogitsProcessorList = list
+_mock_llama.LlamaGrammar = MagicMock
+sys.modules.setdefault("llama_cpp.llama", _mock_llama)
+
import llama_cpp.llama_types as llama_types
import llama_cpp.llama_chat_format as llama_chat_format
-from llama_cpp.llama_chat_format import hf_tokenizer_config_to_chat_formatter
+from llama_cpp.llama_chat_format import (
+ hf_tokenizer_config_to_chat_formatter,
+ guess_chat_format_from_gguf_metadata,
+ DEEPSEEK_R1_CHAT_TEMPLATE,
+)
+
+ChatCompletionRequestUserMessage = llama_types.ChatCompletionRequestUserMessage
def test_mistral_instruct():
chat_template = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
@@ -87,3 +107,131 @@ def test_hf_tokenizer_config_str_to_chat_formatter():
)
assert chat_formatter_respoonse.prompt == ("[INST] Hello, world! [/INST]" "")
+
+
+def test_deepseek_r1_single_turn():
+ """Test DeepSeek R1 format with a single user message."""
+ messages = [
+ llama_types.ChatCompletionRequestUserMessage(role="user", content="Hello"),
+ ]
+ response = llama_chat_format.format_deepseek_r1(messages=messages)
+
+ bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>"
+ eos = "<\uff5cend\u2581of\u2581sentence\uff5c>"
+ user_tag = "<\uff5cUser\uff5c>"
+ assistant_tag = "<\uff5cAssistant\uff5c>"
+
+ expected = f"{bos}{user_tag}Hello{assistant_tag}"
+ assert response.prompt == expected
+ assert response.stop == eos
+ assert response.added_special is True
+
+
+def test_deepseek_r1_with_system_message():
+ """Test DeepSeek R1 format with a system message."""
+ messages = [
+ llama_types.ChatCompletionRequestSystemMessage(role="system", content="You are a helpful assistant."),
+ llama_types.ChatCompletionRequestUserMessage(role="user", content="Hi"),
+ ]
+ response = llama_chat_format.format_deepseek_r1(messages=messages)
+
+ bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>"
+ eos = "<\uff5cend\u2581of\u2581sentence\uff5c>"
+ user_tag = "<\uff5cUser\uff5c>"
+ assistant_tag = "<\uff5cAssistant\uff5c>"
+
+ expected = f"{bos}You are a helpful assistant.{user_tag}Hi{assistant_tag}"
+ assert response.prompt == expected
+
+
+def test_deepseek_r1_multi_turn():
+ """Test DeepSeek R1 format with multi-turn conversation."""
+ messages = [
+ llama_types.ChatCompletionRequestUserMessage(role="user", content="What is 2+2?"),
+ llama_types.ChatCompletionRequestAssistantMessage(role="assistant", content="4"),
+ llama_types.ChatCompletionRequestUserMessage(role="user", content="And 3+3?"),
+ ]
+ response = llama_chat_format.format_deepseek_r1(messages=messages)
+
+ bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>"
+ eos = "<\uff5cend\u2581of\u2581sentence\uff5c>"
+ user_tag = "<\uff5cUser\uff5c>"
+ assistant_tag = "<\uff5cAssistant\uff5c>"
+
+ expected = (
+ f"{bos}"
+ f"{user_tag}What is 2+2?"
+ f"{assistant_tag}4{eos}"
+ f"{user_tag}And 3+3?"
+ f"{assistant_tag}"
+ )
+ assert response.prompt == expected
+
+
+def test_deepseek_r1_think_stripping():
+ """Test that reasoning content is stripped from assistant messages in multi-turn."""
+ messages = [
+ llama_types.ChatCompletionRequestUserMessage(role="user", content="Solve x+1=3"),
+ llama_types.ChatCompletionRequestAssistantMessage(
+ role="assistant",
+ content="Let me solve this step by step. x+1=3, so x=2.x = 2",
+ ),
+ llama_types.ChatCompletionRequestUserMessage(role="user", content="Are you sure?"),
+ ]
+ response = llama_chat_format.format_deepseek_r1(messages=messages)
+
+ bos = "<\uff5cbegin\u2581of\u2581sentence\uff5c>"
+ eos = "<\uff5cend\u2581of\u2581sentence\uff5c>"
+ user_tag = "<\uff5cUser\uff5c>"
+ assistant_tag = "<\uff5cAssistant\uff5c>"
+
+ # The thinking content should be stripped, only "x = 2" remains
+ expected = (
+ f"{bos}"
+ f"{user_tag}Solve x+1=3"
+ f"{assistant_tag}x = 2{eos}"
+ f"{user_tag}Are you sure?"
+ f"{assistant_tag}"
+ )
+ assert response.prompt == expected
+
+
+def test_deepseek_r1_distill_aliases():
+ """Test that distilled model aliases produce the same output as the base format."""
+ messages = [
+ llama_types.ChatCompletionRequestUserMessage(role="user", content="Hello"),
+ ]
+ base = llama_chat_format.format_deepseek_r1(messages=messages)
+ qwen = llama_chat_format.format_deepseek_r1_distill_qwen(messages=messages)
+ llama_variant = llama_chat_format.format_deepseek_r1_distill_llama(messages=messages)
+
+ assert base.prompt == qwen.prompt
+ assert base.prompt == llama_variant.prompt
+ assert base.stop == qwen.stop == llama_variant.stop
+ assert base.added_special == qwen.added_special == llama_variant.added_special
+
+
+def test_guess_chat_format_deepseek_r1_exact_match():
+ """Test auto-detection via exact template match."""
+ metadata = {"tokenizer.chat_template": DEEPSEEK_R1_CHAT_TEMPLATE}
+ assert guess_chat_format_from_gguf_metadata(metadata) == "deepseek-r1"
+
+
+def test_guess_chat_format_deepseek_r1_heuristic():
+ """Test auto-detection via heuristic token presence."""
+ # A template that contains the DeepSeek tokens but isn't an exact match
+ fake_template = "some preamble <\uff5cUser\uff5c> stuff <\uff5cAssistant\uff5c> more stuff"
+ metadata = {"tokenizer.chat_template": fake_template}
+ assert guess_chat_format_from_gguf_metadata(metadata) == "deepseek-r1"
+
+
+def test_guess_chat_format_no_match():
+ """Test that unrecognized templates return None."""
+ metadata = {"tokenizer.chat_template": "some unknown template"}
+ assert guess_chat_format_from_gguf_metadata(metadata) is None
+
+
+def test_guess_chat_format_no_template():
+ """Test that missing chat_template returns None."""
+ metadata = {}
+ assert guess_chat_format_from_gguf_metadata(metadata) is None
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 4227c9be4..319146247 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 4227c9be4268ac844921b90f31595f81236bd317
+Subproject commit 319146247e643695f94a558e8ae686277dd4f8da