Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 5d357ba

Browse files
authored
feat: enable Anthropic prompt caching on system prompt and tools (#69)
* feat: enable Anthropic prompt caching on system prompt and tools Mark the rendered system prompt and the tool block with cache_control breakpoints when calling Anthropic models. The static prefix (~4-5K tokens of system prompt + 15+ tool definitions) was being re-billed at full input rate on every turn, every retry, and every research sub-agent iteration (up to 60 per task). With ephemeral cache breakpoints, subsequent turns within the 5-minute TTL are billed at cache-read pricing (~10% of input cost). Expected savings: 40-50% input tokens on multi-turn conversations, 60-80% on research sub-agent loops. Caching is GA in the Anthropic API and natively supported by litellm 1.83+ via cache_control blocks (no beta header required). Non-Anthropic models (HF router, OpenAI) are passed through unchanged. The helper does not mutate the caller's message list or tool list, so the persisted ContextManager.items history stays in its original string-content form. * refactor: hoist prompt_caching imports to module level, drop cached_ prefix
1 parent e2552e8 commit 5d357ba

4 files changed

Lines changed: 77 additions & 4 deletions

File tree

‎agent/context_manager/manager.py‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
from jinja2 import Template
1414
from litellm import Message, acompletion
1515

16+
from agent.core.prompt_caching import with_prompt_caching
17+
1618
logger = logging.getLogger(__name__)
1719

1820
_HF_WHOAMI_URL = "https://huggingface.co/api/whoami-v2"
@@ -114,6 +116,9 @@ async def summarize_messages(
114116

115117
prompt_messages = list(messages) + [Message(role="user", content=prompt)]
116118
llm_params = _resolve_llm_params(model_name, hf_token, reasoning_effort="high")
119+
prompt_messages, tool_specs = with_prompt_caching(
120+
prompt_messages, tool_specs, llm_params.get("model")
121+
)
117122
response = await acompletion(
118123
messages=prompt_messages,
119124
max_completion_tokens=max_tokens,

‎agent/core/agent_loop.py‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from agent.config import Config
1515
from agent.core.doom_loop import check_for_doom_loop
1616
from agent.core.llm_params import _resolve_llm_params
17+
from agent.core.prompt_caching import with_prompt_caching
1718
from agent.core.session import Event, OpType, Session
1819
from agent.core.tools import ToolRouter
1920
from agent.tools.jobs_tool import CPU_FLAVORS
@@ -296,6 +297,7 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
296297
"""Call the LLM with streaming, emitting assistant_chunk events."""
297298
response = None
298299
_healed_effort = False # one-shot safety net per call
300+
messages, tools = with_prompt_caching(messages, tools, llm_params.get("model"))
299301
for _llm_attempt in range(_MAX_LLM_RETRIES):
300302
try:
301303
response = await acompletion(
@@ -390,6 +392,7 @@ async def _call_llm_non_streaming(session: Session, messages, tools, llm_params)
390392
"""Call the LLM without streaming, emit assistant_message at the end."""
391393
response = None
392394
_healed_effort = False
395+
messages, tools = with_prompt_caching(messages, tools, llm_params.get("model"))
393396
for _llm_attempt in range(_MAX_LLM_RETRIES):
394397
try:
395398
response = await acompletion(
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""Anthropic prompt caching breakpoints for outgoing LLM requests.
2+
3+
Caching is GA on Anthropic's API and natively supported by litellm >=1.83
4+
via ``cache_control`` blocks. We apply two breakpoints (out of 4 allowed):
5+
6+
1. The tool block — caches all tool definitions as a single prefix.
7+
2. The system message — caches the rendered system prompt.
8+
9+
Together these cover the ~4-5K static tokens that were being re-billed on
10+
every turn. Subsequent turns within the 5-minute TTL hit cache_read pricing
11+
(~10% of input cost) instead of full input.
12+
13+
Non-Anthropic models (HF router, OpenAI) are passed through unchanged.
14+
"""
15+
16+
from typing import Any
17+
18+
19+
def with_prompt_caching(
20+
messages: list[Any],
21+
tools: list[dict] | None,
22+
model_name: str | None,
23+
) -> tuple[list[Any], list[dict] | None]:
24+
"""Return (messages, tools) with cache_control breakpoints for Anthropic.
25+
26+
No-op for non-Anthropic models. Original objects are not mutated; a fresh
27+
list with replaced first message and last tool is returned, so callers
28+
that share the underlying ``ContextManager.items`` list don't see their
29+
persisted history rewritten.
30+
"""
31+
if not model_name or not model_name.startswith("anthropic/"):
32+
return messages, tools
33+
34+
if tools:
35+
new_tools = list(tools)
36+
last = dict(new_tools[-1])
37+
last["cache_control"] = {"type": "ephemeral"}
38+
new_tools[-1] = last
39+
tools = new_tools
40+
41+
if messages:
42+
first = messages[0]
43+
role = first.get("role") if isinstance(first, dict) else getattr(first, "role", None)
44+
if role == "system":
45+
content = (
46+
first.get("content")
47+
if isinstance(first, dict)
48+
else getattr(first, "content", None)
49+
)
50+
if isinstance(content, str) and content:
51+
cached_block = [{
52+
"type": "text",
53+
"text": content,
54+
"cache_control": {"type": "ephemeral"},
55+
}]
56+
new_first = {"role": "system", "content": cached_block}
57+
messages = [new_first] + list(messages[1:])
58+
59+
return messages, tools

‎agent/tools/research_tool.py‎

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
from agent.core.doom_loop import check_for_doom_loop
1717
from agent.core.llm_params import _resolve_llm_params
18+
from agent.core.prompt_caching import with_prompt_caching
1819
from agent.core.session import Event
1920

2021
logger = logging.getLogger(__name__)
@@ -323,8 +324,9 @@ async def _log(text: str) -> None:
323324
),
324325
))
325326
try:
327+
_msgs, _ = with_prompt_caching(messages, None, llm_params.get("model"))
326328
response = await acompletion(
327-
messages=messages,
329+
messages=_msgs,
328330
tools=None, # no tools — force text response
329331
stream=False,
330332
timeout=120,
@@ -348,9 +350,12 @@ async def _log(text: str) -> None:
348350
))
349351

350352
try:
353+
_msgs, _tools = with_prompt_caching(
354+
messages, tool_specs if tool_specs else None, llm_params.get("model")
355+
)
351356
response = await acompletion(
352-
messages=messages,
353-
tools=tool_specs if tool_specs else None,
357+
messages=_msgs,
358+
tools=_tools,
354359
tool_choice="auto",
355360
stream=False,
356361
timeout=120,
@@ -446,8 +451,9 @@ async def _log(text: str) -> None:
446451
),
447452
))
448453
try:
454+
_msgs, _ = with_prompt_caching(messages, None, llm_params.get("model"))
449455
response = await acompletion(
450-
messages=messages,
456+
messages=_msgs,
451457
tools=None,
452458
stream=False,
453459
timeout=120,

0 commit comments

Comments
 (0)