tensorchord · kemingy · May 16, 2025 · May 9, 2025 · May 13, 2025 · May 16, 2025
diff --git a/README.md b/README.md
@@ -8,6 +8,7 @@
 [![GitHub License][license-badge]][license-link]
 [![PyPI - Version][pypi-badge]][pypi-link]
 [![Discord][discord-badge]][discord-link]
+[![Blog][blog-badge]][blog-link]
 
 </p>
 <p><em>Turn PostgreSQL into your search engine in a Pythonic way.</em></p>
@@ -46,7 +47,8 @@ The related Docker images can be found in [VectorChord Suite][vectorchord-suite]
 - [beir.py](examples/beir.py): the most flexible way to use the library (loading, indexing, querying and evaluation)
 - [web.py](examples/web.py): build a web application with from the defined tables and pipeline
 - [essay.py](examples/essay.py): extract the content from Paul Graham's essays and evaluate the search results from LLM generated queries
-- [contextual.py](examples/contextual.py): contextual retrieval example
+- [contextual.py](examples/contextual.py): contextual retrieval example with local PDF
+- [anthropic.py](examples/anthropic.py): contextual retrieval using the example from Anthropic's tutorial
 - [hybrid.py](examples/hybrid.py): hybrid search that rerank the results from vector search with keyword search
 
 ## User Guide
@@ -189,3 +191,5 @@ make format
 [discord-badge]: https://img.shields.io/discord/974584200327991326?&logoColor=white&color=5865F2&style=flat&logo=discord&cacheSeconds=60
 [discord-link]: https://discord.gg/KqswhpVgdU
 [vectorchord-suite]: https://github.com/tensorchord/VectorChord-images
+[blog-badge]: https://img.shields.io/badge/VectorChord-Blog-DAFDBA
+[blog-link]: https://blog.vectorchord.ai/
diff --git a/docs/source/example.md b/docs/source/example.md
@@ -24,6 +24,12 @@
 :code: python
 ```
 
+## Contextual retrieval with the Anthropic example
+
+```{include} ../../examples/anthropic.py
+:code: python
+```
+
 ## Evaluate with generated queries
 
 ```{include} ../../examples/essay.py

diff --git a/docs/source/utils.md b/docs/source/utils.md
@@ -1,6 +1,6 @@
 # Toolkit
 
-We provides some basic tools to help you build the RAG pipeline. But it's not limited to thses
+We provide some basic tools to help you build the RAG pipeline, but you are not limited to these
 internal tools. You can use whatever you like.
 
 You may need to install with extras:

diff --git a/examples/anthropic.py b/examples/anthropic.py
@@ -0,0 +1,239 @@
+"""Anthropic Cookbook Contextual Embedding Example.
+
+Data can be found at "https://github.com/anthropics/anthropic-cookbook".
+"""
+
+import json
+from pathlib import Path
+from time import perf_counter
+from typing import Annotated, Optional
+
+import httpx
+
+from vechord.augment import GeminiAugmenter
+from vechord.embedding import GeminiDenseEmbedding
+from vechord.registry import VechordRegistry
+from vechord.rerank import CohereReranker, ReciprocalRankFusion
+from vechord.spec import (
+    ForeignKey,
+    Keyword,
+    PrimaryKeyAutoIncrease,
+    Table,
+    UniqueIndex,
+    Vector,
+)
+
+DenseVector = Vector[768]
+emb = GeminiDenseEmbedding()
+vr = VechordRegistry("anthropic", "postgresql://postgres:[email protected]:5432/")
+
+
+class Document(Table, kw_only=True):
+    uid: Optional[PrimaryKeyAutoIncrease] = None
+    uuid: Annotated[str, UniqueIndex()]
+    content: str
+
+
+class Chunk(Table, kw_only=True):
+    uid: Optional[PrimaryKeyAutoIncrease] = None
+    doc_uuid: Annotated[str, ForeignKey[Document.uuid]]
+    index: int
+    content: str
+    vector: DenseVector
+    keyword: Keyword
+
+
+class ContextualChunk(Table, kw_only=True):
+    uid: Optional[PrimaryKeyAutoIncrease] = None
+    doc_uuid: Annotated[str, ForeignKey[Document.uuid]]
+    index: int
+    content: str
+    context: str
+    vector: DenseVector
+    keyword: Keyword
+
+
+class Query(Table, kw_only=True):
+    uid: Optional[PrimaryKeyAutoIncrease] = None
+    content: str
+    answer: str
+    doc_uuids: list[str]
+    chunk_index: list[int]
+    vector: DenseVector
+
+
+vr.register([Document, Chunk, ContextualChunk, Query])
+
+
+def download_data(url: str, save_path: str):
+    if Path(save_path).is_file():
+        print(f"{save_path} already exists, skip download.")
+        return
+    with httpx.stream("GET", url) as response, open(save_path, "wb") as f:
+        for chunk in response.iter_bytes():
+            f.write(chunk)
+
+
+def load_data(filepath: str):
+    with open(filepath, "r", encoding="utf-8") as f:
+        docs = json.load(f)
+        for doc in docs:
+            vr.insert(
+                Document(
+                    uuid=doc["original_uuid"],
+                    content=doc["content"],
+                )
+            )
+            for chunk in doc["chunks"]:
+                vr.insert(
+                    Chunk(
+                        doc_uuid=doc["original_uuid"],
+                        index=chunk["original_index"],
+                        content=chunk["content"],
+                        vector=emb.vectorize_chunk(chunk["content"]),
+                        keyword=Keyword(chunk["content"]),
+                    )
+                )
+
+
+def load_contextual_chunks(filepath: str):
+    augmenter = GeminiAugmenter()
+
+    with open(filepath, "r", encoding="utf-8") as f:
+        docs = json.load(f)
+        for doc in docs:
+            augmenter.reset(doc["content"])
+            chunks = doc["chunks"]
+            augments = augmenter.augment_context([chunk["content"] for chunk in chunks])
+            if len(augments) != len(chunks):
+                print(
+                    f"augments length not match for uuid: {doc['original_uuid']}, {len(augments)} != {len(chunks)}"
+                )
+            for chunk, context in zip(chunks, augments, strict=False):
+                contextual_content = f"{chunk['content']}\n\n{context}"
+                vr.insert(
+                    ContextualChunk(
+                        doc_uuid=doc["original_uuid"],
+                        index=chunk["original_index"],
+                        content=chunk["content"],
+                        context=context,
+                        vector=emb.vectorize_chunk(contextual_content),
+                        keyword=Keyword(contextual_content),
+                    )
+                )
+
+
+def load_query(filepath: str):
+    queries = []
+    with open(filepath, "r", encoding="utf-8") as f:
+        for line in f:
+            query = json.loads(line)
+            queries.append(
+                Query(
+                    content=query["query"],
+                    answer=query["answer"],
+                    doc_uuids=[x[0] for x in query["golden_chunk_uuids"]],
+                    chunk_index=[x[1] for x in query["golden_chunk_uuids"]],
+                    vector=emb.vectorize_query(query["query"]),
+                )
+            )
+    vr.copy_bulk(queries)
+
+
+def vector_search(query: Query, topk: int) -> list[Chunk]:
+    return vr.search_by_vector(Chunk, query.vector, topk=topk)
+
+
+def vector_contextual_search(query: Query, topk: int) -> list[ContextualChunk]:
+    return vr.search_by_vector(ContextualChunk, query.vector, topk=topk)
+
+
+def keyword_search(query: Query, topk: int) -> list[Chunk]:
+    return vr.search_by_keyword(Chunk, query.content, topk=topk)
+
+
+def keyword_contextual_search(query: Query, topk: int) -> list[ContextualChunk]:
+    return vr.search_by_keyword(ContextualChunk, query.content, topk=topk)
+
+
+def hybrid_search_fuse(query: Query, topk: int) -> list[Chunk]:
+    rrf = ReciprocalRankFusion()
+    return rrf.fuse([vector_search(query, topk), keyword_search(query, topk)])[:topk]
+
+
+def hybrid_contextual_search_fuse(query: Query, topk: int) -> list[ContextualChunk]:
+    rrf = ReciprocalRankFusion()
+    return rrf.fuse(
+        [vector_contextual_search(query, topk), keyword_contextual_search(query, topk)]
+    )[:topk]
+
+
+def hybrid_search_rerank(query: Query, topk: int, boost=3) -> list[Chunk]:
+    ranker = CohereReranker()
+    vecs = vector_search(query, topk * boost)
+    keys = keyword_search(query, topk * boost)
+    chunks = list({chunk.uid: chunk for chunk in vecs + keys}.values())
+    indices = ranker.rerank(query.content, [chunk.content for chunk in chunks])
+    return [chunks[i] for i in indices[:topk]]
+
+
+def hybrid_contextual_search_rerank(
+    query: Query, topk: int, boost=3
+) -> list[ContextualChunk]:
+    ranker = CohereReranker()
+    vecs = vector_contextual_search(query, topk * boost)
+    keys = keyword_contextual_search(query, topk * boost)
+    chunks = list({chunk.uid: chunk for chunk in vecs + keys}.values())
+    indices = ranker.rerank(
+        query.content, [f"{chunk.content}\n{chunk.context}" for chunk in chunks]
+    )
+    return [chunks[i] for i in indices[:topk]]
+
+
+def evaluate(topk=5, search_func=vector_search):
+    print(f"TopK={topk}, search by: {search_func.__name__}")
+    queries: list[Query] = vr.select_by(Query.partial_init())
+    total_score = 0
+    start = perf_counter()
+    for query in queries:
+        chunks: list[Chunk] = search_func(query, topk)
+        count = 0
+        for doc_uuid, chunk_index in zip(
+            query.doc_uuids, query.chunk_index, strict=True
+        ):
+            for chunk in chunks:
+                if chunk.doc_uuid == doc_uuid and chunk.index == chunk_index:
+                    count += 1
+                    break
+        score = count / len(query.doc_uuids)
+        total_score += score
+
+    print(
+        f"Pass@{topk}: {total_score / len(queries):.4f}, total queries: {len(queries)}, QPS: {len(queries) / (perf_counter() - start):.3f}"
+    )
+
+
+if __name__ == "__main__":
+    Path("datasets").mkdir(parents=True, exist_ok=True)
+    download_data(
+        "https://raw.githubusercontent.com/anthropics/anthropic-cookbook/refs/heads/main/skills/contextual-embeddings/data/codebase_chunks.json",
+        "datasets/codebase_chunks.json",
+    )
+    download_data(
+        "https://raw.githubusercontent.com/anthropics/anthropic-cookbook/refs/heads/main/skills/contextual-embeddings/data/evaluation_set.jsonl",
+        "datasets/evaluation_set.jsonl",
+    )
+    load_data("datasets/codebase_chunks.json")
+    load_query("datasets/evaluation_set.jsonl")
+    load_contextual_chunks("datasets/codebase_chunks.json")
+
+    for topk in [5, 10]:
+        print("=" * 50)
+        evaluate(topk=topk, search_func=vector_search)
+        evaluate(topk=topk, search_func=keyword_search)
+        evaluate(topk=topk, search_func=hybrid_search_fuse)
+        evaluate(topk=topk, search_func=hybrid_search_rerank)
+        evaluate(topk=topk, search_func=vector_contextual_search)
+        evaluate(topk=topk, search_func=keyword_contextual_search)
+        evaluate(topk=topk, search_func=hybrid_contextual_search_fuse)
+        evaluate(topk=topk, search_func=hybrid_contextual_search_rerank)
diff --git a/tests/test_table.py b/tests/test_table.py
@@ -6,6 +6,7 @@
 import msgspec
 import numpy as np
 import pytest
+from psycopg.errors import UniqueViolation
 from psycopg.types.json import Jsonb
 
 from vechord.log import logger
@@ -17,6 +18,7 @@
     PrimaryKeyAutoIncrease,
     PrimaryKeyUUID,
     Table,
+    UniqueIndex,
     Vector,
     VectorIndex,
 )
@@ -113,6 +115,27 @@ def test_annotated_index(registry):
     assert len(res) == topk
 
 
+@pytest.mark.db
+def test_unique_index(registry):
+    class UniqueTable(Table, kw_only=True):
+        uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
+        sid: Annotated[str, UniqueIndex()]
+
+    class SubTable(Table, kw_only=True):
+        uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
+        text: str
+        foreign_key: Annotated[str, ForeignKey[UniqueTable.sid]]
+
+    registry.register([UniqueTable, SubTable])
+    registry.insert(UniqueTable(sid="id_0"))
+    with pytest.raises(UniqueViolation):
+        registry.insert(UniqueTable(sid="id_0"))
+    registry.insert(SubTable(text="hello", foreign_key="id_0"))
+    registry.remove_by(UniqueTable.partial_init(sid="id_0"))
+    assert len(registry.select_by(UniqueTable.partial_init())) == 0
+    assert len(registry.select_by(SubTable.partial_init())) == 0
+
+
 @pytest.mark.db
 def test_keyword_tokenizer(registry):
     Tockenizer = Keyword.with_model("wiki_tocken")

diff --git a/vechord/augment.py b/vechord/augment.py
@@ -78,19 +78,18 @@ def augment(self, chunks: list[str], prompt: str) -> list[str]:
             for chunk in chunks:
                 context = prompt.format(chunk=chunk)
                 if self.doc:
-                    context = f"<document>{self.doc}</document>\n" + context
+                    context = f"<document>\n{self.doc}\n</document>\n" + context
                 response = self.client.generate_content([context])
                 res.append(response.text)
         except Exception as e:
             logger.error("GeminiAugmenter error: %s", e)
-            breakpoint()
         return res
 
     def augment_context(self, chunks: list[str]) -> list[str]:
         """Generate the contextual chunks."""
         prompt = (
-            "Here is the chunk we want to situate within the whole document "
-            "<chunk>{chunk}</chunk>"
+            "Here is the chunk we want to situate within the whole document \n"
+            "<chunk>\n{chunk}\n</chunk>\n"
             "Please give a short succinct context to situate this chunk within "
             "the overall document for the purposes of improving search retrieval "
             "of the chunk. Answer only with the succinct context and nothing else."
@@ -100,8 +99,8 @@ def augment_context(self, chunks: list[str]) -> list[str]:
     def augment_query(self, chunks: list[str]) -> list[str]:
         """Generate the queries for chunks."""
         prompt = (
-            "Here is the chunk we want to ask questions about "
-            "<chunk>{chunk}</chunk>"
+            "Here is the chunk we want to ask questions about \n"
+            "<chunk>\n{chunk}\n</chunk>\n"
             "Please ask questions about this chunk based on the overall document "
             "for the purposes of improving search retrieval of the chunk. "
             "Answer only with the question and nothing else."