Commit 850416a

Merge branch 'main' into batch-processing
2 parents e7ef07d + 6f08021 commit 850416a

11 files changed (+321 −27 lines)

README.md (+1)

@@ -14,6 +14,7 @@ This package provides:
 - High-level Python API for text completion
 - OpenAI-like API
 - [LangChain compatibility](https://python.langchain.com/docs/integrations/llms/llamacpp)
+- [LlamaIndex compatibility](https://docs.llamaindex.ai/en/stable/examples/llm/llama_2_llama_cpp.html)
 - OpenAI compatible web server
 - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
 - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)

docs/templates.md (new file, +52)

@@ -0,0 +1,52 @@
+# Templates
+
+This document provides a comprehensive guide to the integration of Jinja2 templating into the `llama-cpp-python` project, with a focus on enhancing the chat functionality of the `llama-2` model.
+
+## Introduction
+
+- Brief explanation of the `llama-cpp-python` project's need for a templating system.
+- Overview of the `llama-2` model's interaction with templating.
+
+## Jinja2 Dependency Integration
+
+- Rationale for choosing Jinja2 as the templating engine.
+- Compatibility with Hugging Face's `transformers`.
+- Desire for advanced templating features and simplicity.
+- Detailed steps for adding `jinja2` to `pyproject.toml` for dependency management.
+
+## Template Management Refactor
+
+- Summary of the refactor and the motivation behind it.
+- Description of the new chat handler selection logic:
+  1. Preference for a user-specified `chat_handler`.
+  2. Fallback to a user-specified `chat_format`.
+  3. Defaulting to a chat format from a `.gguf` file if available.
+  4. Utilizing the `llama2` default chat format as the final fallback.
+- Ensuring backward compatibility throughout the refactor.
+
+## Implementation Details
+
+- In-depth look at the new `AutoChatFormatter` class.
+- Example code snippets showing how to utilize the Jinja2 environment and templates.
+- Guidance on how to provide custom templates or use defaults.
+
+## Testing and Validation
+
+- Outline of the testing strategy to ensure seamless integration.
+- Steps for validating backward compatibility with existing implementations.
+
+## Benefits and Impact
+
+- Analysis of the expected benefits, including consistency, performance gains, and improved developer experience.
+- Discussion of the potential impact on current users and contributors.
+
+## Future Work
+
+- Exploration of how templating can evolve within the project.
+- Consideration of additional features or optimizations for the templating engine.
+- Mechanisms for community feedback on the templating system.
+
+## Conclusion
+
+- Final thoughts on the integration of Jinja2 templating.
+- Call to action for community involvement and feedback.
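
The four-step fallback listed under "Template Management Refactor" above can be pictured with a short, self-contained sketch. This is only an illustration: the handler table and the resolver function below are placeholders, not APIs introduced by this commit.

# Illustrative sketch of the chat handler selection order described above.
# The _HANDLERS registry and resolve_chat_handler() are hypothetical stand-ins.
from typing import Callable, Dict, Optional

_HANDLERS: Dict[str, Callable] = {"llama-2": lambda messages: messages}  # placeholder registry

def resolve_chat_handler(
    chat_handler: Optional[Callable] = None,
    chat_format: Optional[str] = None,
    gguf_chat_format: Optional[str] = None,
) -> Callable:
    if chat_handler is not None:        # 1. prefer a user-specified chat_handler
        return chat_handler
    if chat_format is not None:         # 2. fall back to a user-specified chat_format
        return _HANDLERS[chat_format]
    if gguf_chat_format is not None:    # 3. then a chat format read from the .gguf file
        return _HANDLERS.get(gguf_chat_format, _HANDLERS["llama-2"])
    return _HANDLERS["llama-2"]         # 4. llama2 default as the final fallback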

llama_cpp/_internals.py (−2)

@@ -26,7 +26,6 @@

 class _LlamaModel:
     """Intermediate Python wrapper for a llama.cpp llama_model.
-
     NOTE: For stability it's recommended you use the Llama class instead."""

     _llama_free_model = None

@@ -213,7 +212,6 @@ def default_params():

 class _LlamaContext:
     """Intermediate Python wrapper for a llama.cpp llama_context.
-
     NOTE: For stability it's recommended you use the Llama class instead."""

     _llama_free = None

llama_cpp/llama.py (+12 −8)

@@ -5,7 +5,6 @@
 import uuid
 import time
 import multiprocessing
-
 from typing import (
     List,
     Optional,

@@ -25,18 +24,23 @@

 from .llama_types import *
 from .llama_grammar import LlamaGrammar
-from .llama_cache import BaseLlamaCache
-
+from .llama_cache import (
+    BaseLlamaCache,
+    LlamaCache, # type: ignore
+    LlamaDiskCache, # type: ignore
+    LlamaRAMCache, # type: ignore
+)
 import llama_cpp.llama_cpp as llama_cpp
 import llama_cpp.llama_chat_format as llama_chat_format

 from ._internals import (
-    _LlamaModel,
-    _LlamaContext,
-    _LlamaBatch,
+    _LlamaModel, # type: ignore
+    _LlamaContext, # type: ignore
+    _LlamaBatch, # type: ignore
     _LlamaTokenDataArray, # type: ignore
-    _LlamaSamplingParams,
-    _LlamaSamplingContext,
+    _LlamaSamplingParams, # type: ignore
+    _LlamaSamplingContext, # type: ignore
+
 )
 from ._utils import suppress_stdout_stderr
llama_cpp/llama_cache.py (+8 −12)

@@ -1,5 +1,4 @@
 import sys
-
 from abc import ABC, abstractmethod
 from typing import (
     Optional,

@@ -8,8 +7,12 @@
 )
 from collections import OrderedDict

+import diskcache
+
 import llama_cpp.llama

+from .llama_types import *
+

 class BaseLlamaCache(ABC):
     """Base cache class for a llama.cpp model."""

@@ -37,9 +40,7 @@ def __contains__(self, key: Sequence[int]) -> bool:
         raise NotImplementedError

     @abstractmethod
-    def __setitem__(
-        self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"
-    ) -> None:
+    def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState") -> None:
         raise NotImplementedError


@@ -49,9 +50,7 @@ class LlamaRAMCache(BaseLlamaCache):
     def __init__(self, capacity_bytes: int = (2 << 30)):
         super().__init__(capacity_bytes)
         self.capacity_bytes = capacity_bytes
-        self.cache_state: OrderedDict[
-            Tuple[int, ...], "llama_cpp.llama.LlamaState"
-        ] = OrderedDict()
+        self.cache_state: OrderedDict[Tuple[int, ...], "llama_cpp.llama.LlamaState"] = OrderedDict()

     @property
     def cache_size(self):

@@ -64,8 +63,7 @@ def _find_longest_prefix_key(
         min_len = 0
         min_key = None
         keys = (
-            (k, llama_cpp.llama.Llama.longest_token_prefix(k, key))
-            for k in self.cache_state.keys()
+            (k, llama_cpp.llama.Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys()
         )
         for k, prefix_len in keys:
             if prefix_len > min_len:

@@ -104,8 +102,6 @@ class LlamaDiskCache(BaseLlamaCache):
     def __init__(
         self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30)
     ):
-        import diskcache
-
         super().__init__(capacity_bytes)
         self.cache = diskcache.Cache(cache_dir)

@@ -131,7 +127,7 @@ def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState":
         _key = self._find_longest_prefix_key(key)
         if _key is None:
             raise KeyError("Key not found")
-        value: "LlamaState" = self.cache.pop(_key) # type: ignore
+        value: "llama_cpp.llama.LlamaState" = self.cache.pop(_key) # type: ignore
         # NOTE: This puts an integer as key in cache, which breaks,
         # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens
         # self.cache.push(_key, side="front") # type: ignore
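
For orientation, a minimal usage sketch of the disk cache touched above. It assumes the `Llama.set_cache` helper available in llama-cpp-python; the model path is a placeholder.

# Hypothetical usage sketch: attach a disk-backed cache to a model so repeated
# prompts with a shared prefix can reuse saved llama state.
# The model path is a placeholder; Llama.set_cache is an assumed helper.
from llama_cpp import Llama
from llama_cpp.llama_cache import LlamaDiskCache

llm = Llama(model_path="./models/model.gguf")
llm.set_cache(LlamaDiskCache(cache_dir=".cache/llama_cache", capacity_bytes=2 << 30))

out = llm("Q: Name the planets in the solar system. A:", max_tokens=32)
print(out["choices"][0]["text"])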

llama_cpp/llama_jinja_format.py (new file, +138)

@@ -0,0 +1,138 @@
+"""
+llama_cpp/llama_jinja_format.py
+"""
+import dataclasses
+from typing import Any, Callable, Dict, List, Optional, Protocol, Union
+
+import jinja2
+from jinja2 import Template
+
+# NOTE: We sacrifice readability for usability.
+# It will fail to work as expected if we attempt to format it in a readable way.
+llama2_template = """{% for message in messages %}{% if message['role'] == 'user' %}[INST] {{ message['content'] }} [/INST]\n{% elif message['role'] == 'assistant' %}{{ message['content'] }}\n{% elif message['role'] == 'system' %}<<SYS>> {{ message['content'] }} <</SYS>>\n{% endif %}{% endfor %}"""
+
+
+class MetaSingleton(type):
+    """
+    Metaclass for implementing the Singleton pattern.
+    """
+
+    _instances = {}
+
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            cls._instances[cls] = super(MetaSingleton, cls).__call__(*args, **kwargs)
+        return cls._instances[cls]
+
+
+class Singleton(object, metaclass=MetaSingleton):
+    """
+    Base class for implementing the Singleton pattern.
+    """
+
+    def __init__(self):
+        super(Singleton, self).__init__()
+
+
+@dataclasses.dataclass
+class ChatFormatterResponse:
+    prompt: str
+    stop: Optional[Union[str, List[str]]] = None
+
+
+# Base Chat Formatter Protocol
+class ChatFormatterInterface(Protocol):
+    def __init__(self, template: Optional[object] = None):
+        ...
+
+    def __call__(
+        self,
+        messages: List[Dict[str, str]],
+        **kwargs,
+    ) -> ChatFormatterResponse:
+        ...
+
+    @property
+    def template(self) -> str:
+        ...
+
+
+class AutoChatFormatter(ChatFormatterInterface):
+    def __init__(
+        self,
+        template: Optional[str] = None,
+        template_class: Optional[Template] = None,
+    ):
+        if template is not None:
+            self._template = template
+        else:
+            self._template = llama2_template # default template
+
+        self._environment = jinja2.Environment(
+            loader=jinja2.BaseLoader(),
+            trim_blocks=True,
+            lstrip_blocks=True,
+        ).from_string(
+            self._template,
+            template_class=template_class,
+        )
+
+    def __call__(
+        self,
+        messages: List[Dict[str, str]],
+        **kwargs: Any,
+    ) -> ChatFormatterResponse:
+        formatted_sequence = self._environment.render(messages=messages, **kwargs)
+        return ChatFormatterResponse(prompt=formatted_sequence)
+
+    @property
+    def template(self) -> str:
+        return self._template
+
+
+class FormatterNotFoundException(Exception):
+    pass
+
+
+class ChatFormatterFactory(Singleton):
+    _chat_formatters: Dict[str, Callable[[], ChatFormatterInterface]] = {}
+
+    def register_formatter(
+        self,
+        name: str,
+        formatter_callable: Callable[[], ChatFormatterInterface],
+        overwrite=False,
+    ):
+        if not overwrite and name in self._chat_formatters:
+            raise ValueError(
+                f"Formatter with name '{name}' is already registered. Use `overwrite=True` to overwrite it."
+            )
+        self._chat_formatters[name] = formatter_callable
+
+    def unregister_formatter(self, name: str):
+        if name in self._chat_formatters:
+            del self._chat_formatters[name]
+        else:
+            raise ValueError(f"No formatter registered under the name '{name}'.")
+
+    def get_formatter_by_name(self, name: str) -> ChatFormatterInterface:
+        try:
+            formatter_callable = self._chat_formatters[name]
+            return formatter_callable()
+        except KeyError:
+            raise FormatterNotFoundException(
+                f"Invalid chat format: {name} (valid formats: {list(self._chat_formatters.keys())})"
+            )
+
+
+# Define a chat format class
+class Llama2Formatter(AutoChatFormatter):
+    def __init__(self):
+        super().__init__(llama2_template)
+
+
+# With the Singleton pattern applied, regardless of where or how many times
+# ChatFormatterFactory() is called, it will always return the same instance
+# of the factory, ensuring that the factory's state is consistent throughout
+# the application.
+ChatFormatterFactory().register_formatter("llama-2", Llama2Formatter)
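
A brief usage sketch of the factory and formatter registered in the new module above; the message contents are illustrative.

# Usage sketch for llama_cpp/llama_jinja_format.py as added in this commit.
from llama_cpp.llama_jinja_format import ChatFormatterFactory

factory = ChatFormatterFactory()                      # Singleton: same instance everywhere
formatter = factory.get_formatter_by_name("llama-2")  # instantiates Llama2Formatter

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi there!"},
]
result = formatter(messages)  # renders the llama-2 Jinja2 template
print(result.prompt)          # "<<SYS>> You are a helpful assistant. <</SYS>>\n[INST] Hi there! [/INST]\n"
print(result.stop)            # None -- this formatter does not set stop sequences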

llama_cpp/server/app.py (+57 −2)

@@ -197,7 +197,36 @@ async def authenticate(


 @router.post(
-    "/v1/completions", summary="Completion", dependencies=[Depends(authenticate)]
+    "/v1/completions",
+    summary="Completion",
+    dependencies=[Depends(authenticate)],
+    response_model= Union[
+        llama_cpp.CreateCompletionResponse,
+        str,
+    ],
+    responses={
+        "200": {
+            "description": "Successful Response",
+            "content": {
+                "application/json": {
+                    "schema": {
+                        "anyOf": [
+                            {"$ref": "#/components/schemas/CreateCompletionResponse"}
+                        ],
+                        "title": "Completion response, when stream=False",
+                    }
+                },
+                "text/event-stream":{
+                    "schema": {
+                        "type": "string",
+                        "title": "Server Side Streaming response, when stream=True. " +
+                            "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", # noqa: E501
+                        "example": """data: {... see CreateCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]"""
+                    }
+                }
+            },
+        }
+    },
 )
 @router.post(
     "/v1/engines/copilot-codex/completions",

@@ -280,7 +309,33 @@ async def create_embedding(


 @router.post(
-    "/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)]
+    "/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)],
+    response_model= Union[
+        llama_cpp.ChatCompletion, str
+    ],
+    responses={
+        "200": {
+            "description": "Successful Response",
+            "content": {
+                "application/json": {
+                    "schema": {
+                        "anyOf": [
+                            {"$ref": "#/components/schemas/CreateChatCompletionResponse"}
+                        ],
+                        "title": "Completion response, when stream=False",
+                    }
+                },
+                "text/event-stream":{
+                    "schema": {
+                        "type": "string",
+                        "title": "Server Side Streaming response, when stream=True" +
+                            "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", # noqa: E501
+                        "example": """data: {... see CreateChatCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]"""
+                    }
+                }
+            },
+        }
+    },
 )
 async def create_chat_completion(
     request: Request,
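
Finally, a hedged client-side sketch of consuming the streaming variant documented above. It assumes a server listening on localhost:8000 and OpenAI-style request fields; `requests` stands in for any HTTP client.

# Client-side sketch: stream /v1/completions and parse the SSE "data:" lines
# described in the responses schema above. Host, port, and prompt are placeholders.
import json
import requests

with requests.post(
    "http://localhost:8000/v1/completions",
    json={"prompt": "Hello, my name is", "max_tokens": 32, "stream": True},
    stream=True,
) as resp:
    for raw in resp.iter_lines():
        if not raw:
            continue  # SSE events are separated by blank lines
        payload = raw.decode("utf-8").removeprefix("data: ")
        if payload.strip() == "[DONE]":
            break  # end-of-stream sentinel, as shown in the example above
        chunk = json.loads(payload)  # one CreateCompletionResponse chunk
        print(chunk["choices"][0]["text"], end="", flush=True)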
