Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
[![GitHub License][license-badge]][license-link]
[![PyPI - Version][pypi-badge]][pypi-link]
[![Discord][discord-badge]][discord-link]
[![Blog][blog-badge]][blog-link]

</p>
<p><em>Turn PostgreSQL into your search engine in a Pythonic way.</em></p>
Expand Down Expand Up @@ -46,7 +47,8 @@ The related Docker images can be found in [VectorChord Suite][vectorchord-suite]
- [beir.py](examples/beir.py): the most flexible way to use the library (loading, indexing, querying and evaluation)
- [web.py](examples/web.py): build a web application from the defined tables and pipeline
- [essay.py](examples/essay.py): extract the content from Paul Graham's essays and evaluate the search results from LLM generated queries
- [contextual.py](examples/contextual.py): contextual retrieval example
- [contextual.py](examples/contextual.py): contextual retrieval example with local PDF
- [anthropic.py](examples/anthropic.py): contextual retrieval with Anthropic's tutorial example
- [hybrid.py](examples/hybrid.py): hybrid search that reranks the results from vector search with keyword search

## User Guide
Expand Down Expand Up @@ -189,3 +191,5 @@ make format
[discord-badge]: https://img.shields.io/discord/974584200327991326?&logoColor=white&color=5865F2&style=flat&logo=discord&cacheSeconds=60
[discord-link]: https://discord.gg/KqswhpVgdU
[vectorchord-suite]: https://github.com/tensorchord/VectorChord-images
[blog-badge]: https://img.shields.io/badge/VectorChord-Blog-DAFDBA
[blog-link]: https://blog.vectorchord.ai/
6 changes: 6 additions & 0 deletions docs/source/example.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@
:code: python
```

## Contextual retrieval with the Anthropic example

```{include} ../../examples/anthropic.py
:code: python
```

## Evaluate with generated queries

```{include} ../../examples/essay.py
Expand Down
2 changes: 1 addition & 1 deletion docs/source/utils.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Toolkit

We provides some basic tools to help you build the RAG pipeline. But it's not limited to thses
We provide some basic tools to help you build the RAG pipeline. But it's not limited to these
internal tools. You can use whatever you like.

You may need to install with extras:
Expand Down
239 changes: 239 additions & 0 deletions examples/anthropic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
"""Anthropic Cookbook Contextual Embedding Example.

Data can be found from "https://github.com/anthropics/anthropic-cookbook".
"""

import json
from pathlib import Path
from time import perf_counter
from typing import Annotated, Optional

import httpx

from vechord.augment import GeminiAugmenter
from vechord.embedding import GeminiDenseEmbedding
from vechord.registry import VechordRegistry
from vechord.rerank import CohereReranker, ReciprocalRankFusion
from vechord.spec import (
ForeignKey,
Keyword,
PrimaryKeyAutoIncrease,
Table,
UniqueIndex,
Vector,
)

# Vector column type shared by every table in this example.
# NOTE(review): 768 presumably matches the GeminiDenseEmbedding output width — confirm.
DenseVector = Vector[768]
emb = GeminiDenseEmbedding()
# NOTE(review): the copy of this file under review had the user-info/host part
# of the URL replaced by an e-mail-obfuscation placeholder ("[email protected]"),
# which is not a valid connection string. It is restored here to the
# local-Docker default (postgres:postgres@172.17.0.1) — confirm against your
# own PostgreSQL instance before running.
vr = VechordRegistry("anthropic", "postgresql://postgres:postgres@172.17.0.1:5432/")


class Document(Table, kw_only=True):
    """Table of source documents; `uuid` is the column the chunk tables reference."""

    # Optional auto-increment primary key; left as None when the row is built in Python.
    uid: Optional[PrimaryKeyAutoIncrease] = None
    # Unique-indexed identifier, filled from the dataset's `original_uuid` in load_data().
    uuid: Annotated[str, UniqueIndex()]
    # Document text, filled from the dataset's `content` field in load_data().
    content: str


class Chunk(Table, kw_only=True):
    """Table of plain (non-contextual) chunks, each linked to its `Document`."""

    # Optional auto-increment primary key; left as None when the row is built in Python.
    uid: Optional[PrimaryKeyAutoIncrease] = None
    # Foreign key onto Document.uuid.
    doc_uuid: Annotated[str, ForeignKey[Document.uuid]]
    # Chunk position, filled from the dataset's `original_index` in load_data().
    index: int
    # Raw chunk text.
    content: str
    # Embedding of `content` (see emb.vectorize_chunk in load_data()).
    vector: DenseVector
    # Keyword column built from `content`, queried by the keyword search helpers.
    keyword: Keyword


class ContextualChunk(Table, kw_only=True):
    """Table of chunks stored together with a generated situating context."""

    # Optional auto-increment primary key; left as None when the row is built in Python.
    uid: Optional[PrimaryKeyAutoIncrease] = None
    # Foreign key onto Document.uuid.
    doc_uuid: Annotated[str, ForeignKey[Document.uuid]]
    # Chunk position, filled from the dataset's `original_index`.
    index: int
    # Raw chunk text (without the generated context).
    content: str
    # Context string returned by the augmenter in load_contextual_chunks().
    context: str
    # Embedding of "<content>\n\n<context>" (see load_contextual_chunks()).
    vector: DenseVector
    # Keyword column built from the same "<content>\n\n<context>" string.
    keyword: Keyword


class Query(Table, kw_only=True):
    """Table of evaluation queries with their golden (expected) chunk locations."""

    # Optional auto-increment primary key; left as None when the row is built in Python.
    uid: Optional[PrimaryKeyAutoIncrease] = None
    # Query text, filled from the evaluation set's `query` field.
    content: str
    # Reference answer, filled from the evaluation set's `answer` field.
    answer: str
    # `doc_uuids[i]` and `chunk_index[i]` together identify the i-th golden chunk;
    # both are split out of `golden_chunk_uuids` pairs in load_query().
    doc_uuids: list[str]
    chunk_index: list[int]
    # Embedding of `content` (see emb.vectorize_query in load_query()).
    vector: DenseVector


# Register all four table classes with the registry at import time, before any
# of the load/search helpers below run.
# NOTE(review): presumably this creates the tables/indexes if missing — confirm.
vr.register([Document, Chunk, ContextualChunk, Query])


def download_data(url: str, save_path: str):
    """Download `url` to `save_path` unless that file already exists.

    Args:
        url: HTTP(S) location of the file to fetch.
        save_path: Destination path; if it is already a file the download is
            skipped.

    Raises:
        httpx.HTTPStatusError: if the server answers with a non-success status.
    """
    target = Path(save_path)
    if target.is_file():
        print(f"{save_path} already exists, skip download.")
        return

    # Stream into a sibling ".part" file and rename it only after the whole body
    # has been written. Otherwise an interrupted download (or an HTTP error page)
    # would leave a file at `save_path` that the `is_file()` check above accepts
    # on every later run.
    partial = target.with_name(target.name + ".part")
    try:
        with httpx.stream("GET", url) as response:
            response.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in response.iter_bytes():
                    f.write(chunk)
        partial.replace(target)
    finally:
        # No-op after a successful replace; removes the leftover on failure.
        partial.unlink(missing_ok=True)


def load_data(filepath: str):
    """Insert every document in the JSON file, followed by its plain chunks.

    The file is expected to hold a list of objects with `original_uuid`,
    `content` and `chunks` keys; each chunk carries `original_index` and
    `content`.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        documents = json.load(f)
        for document in documents:
            doc_uuid = document["original_uuid"]
            vr.insert(Document(uuid=doc_uuid, content=document["content"]))
            for piece in document["chunks"]:
                text = piece["content"]
                row = Chunk(
                    doc_uuid=doc_uuid,
                    index=piece["original_index"],
                    content=text,
                    vector=emb.vectorize_chunk(text),
                    keyword=Keyword(text),
                )
                vr.insert(row)


def load_contextual_chunks(filepath: str):
    """Generate a context for each chunk and insert the contextual rows.

    For every document the augmenter is reset with the document text and asked
    for one context per chunk. A length mismatch is only reported; chunks
    without a matching context are skipped by the non-strict zip.
    """
    augmenter = GeminiAugmenter()

    with open(filepath, "r", encoding="utf-8") as f:
        docs = json.load(f)
        for doc in docs:
            augmenter.reset(doc["content"])
            chunks = doc["chunks"]
            texts = [chunk["content"] for chunk in chunks]
            augments = augmenter.augment_context(texts)
            if len(augments) != len(chunks):
                print(
                    f"augments length not match for uuid: {doc['original_uuid']}, {len(augments)} != {len(chunks)}"
                )
            for chunk, context in zip(chunks, augments, strict=False):
                # Both the embedding and the keyword column see chunk + context.
                combined = f"{chunk['content']}\n\n{context}"
                row = ContextualChunk(
                    doc_uuid=doc["original_uuid"],
                    index=chunk["original_index"],
                    content=chunk["content"],
                    context=context,
                    vector=emb.vectorize_chunk(combined),
                    keyword=Keyword(combined),
                )
                vr.insert(row)


def load_query(filepath: str):
    """Load the JSON Lines evaluation set and bulk-copy it into the `Query` table.

    Each non-blank line must be a JSON object with `query`, `answer` and
    `golden_chunk_uuids` (a list of `[doc_uuid, chunk_index]` pairs).

    Args:
        filepath: Path to the `.jsonl` evaluation file.
    """
    queries = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line), which would
            # otherwise make json.loads raise.
            if not line.strip():
                continue
            query = json.loads(line)
            golden = query["golden_chunk_uuids"]
            queries.append(
                Query(
                    content=query["query"],
                    answer=query["answer"],
                    doc_uuids=[pair[0] for pair in golden],
                    chunk_index=[pair[1] for pair in golden],
                    vector=emb.vectorize_query(query["query"]),
                )
            )
    vr.copy_bulk(queries)


def vector_search(query: Query, topk: int) -> list[Chunk]:
    """Search the `Chunk` table with the query's vector, requesting `topk` rows."""
    hits = vr.search_by_vector(Chunk, query.vector, topk=topk)
    return hits


def vector_contextual_search(query: Query, topk: int) -> list[ContextualChunk]:
    """Search the `ContextualChunk` table with the query's vector, requesting `topk` rows."""
    hits = vr.search_by_vector(ContextualChunk, query.vector, topk=topk)
    return hits


def keyword_search(query: Query, topk: int) -> list[Chunk]:
    """Search the `Chunk` table with the query's text, requesting `topk` rows."""
    hits = vr.search_by_keyword(Chunk, query.content, topk=topk)
    return hits


def keyword_contextual_search(query: Query, topk: int) -> list[ContextualChunk]:
    """Search the `ContextualChunk` table with the query's text, requesting `topk` rows."""
    hits = vr.search_by_keyword(ContextualChunk, query.content, topk=topk)
    return hits


def hybrid_search_fuse(query: Query, topk: int) -> list[Chunk]:
    """Fuse vector and keyword results over `Chunk` and keep the first `topk`."""
    fusion = ReciprocalRankFusion()
    by_vector = vector_search(query, topk)
    by_keyword = keyword_search(query, topk)
    fused = fusion.fuse([by_vector, by_keyword])
    return fused[:topk]


def hybrid_contextual_search_fuse(query: Query, topk: int) -> list[ContextualChunk]:
    """Fuse vector and keyword results over `ContextualChunk` and keep the first `topk`."""
    fusion = ReciprocalRankFusion()
    by_vector = vector_contextual_search(query, topk)
    by_keyword = keyword_contextual_search(query, topk)
    fused = fusion.fuse([by_vector, by_keyword])
    return fused[:topk]


def hybrid_search_rerank(query: Query, topk: int, boost=3) -> list[Chunk]:
    """Rerank the union of vector and keyword candidates and keep `topk`.

    Each search is asked for `topk * boost` rows; candidates are de-duplicated
    by `uid` before their `content` is passed to the reranker.
    """
    reranker = CohereReranker()
    pool_size = topk * boost
    from_vector = vector_search(query, pool_size)
    from_keyword = keyword_search(query, pool_size)

    # dict keeps first-seen order while collapsing rows that share a uid.
    unique = {}
    for chunk in from_vector + from_keyword:
        unique[chunk.uid] = chunk
    candidates = list(unique.values())

    order = reranker.rerank(query.content, [c.content for c in candidates])
    return [candidates[i] for i in order[:topk]]


def hybrid_contextual_search_rerank(
    query: Query, topk: int, boost=3
) -> list[ContextualChunk]:
    """Rerank the union of contextual vector and keyword candidates and keep `topk`.

    Each search is asked for `topk * boost` rows; candidates are de-duplicated
    by `uid`, and the reranker sees "<content>\\n<context>" for each of them.
    """
    reranker = CohereReranker()
    pool_size = topk * boost
    from_vector = vector_contextual_search(query, pool_size)
    from_keyword = keyword_contextual_search(query, pool_size)

    # dict keeps first-seen order while collapsing rows that share a uid.
    unique = {}
    for chunk in from_vector + from_keyword:
        unique[chunk.uid] = chunk
    candidates = list(unique.values())

    passages = [f"{c.content}\n{c.context}" for c in candidates]
    order = reranker.rerank(query.content, passages)
    return [candidates[i] for i in order[:topk]]


def evaluate(topk=5, search_func=vector_search):
    """Print Pass@k and throughput of `search_func` over every stored query.

    For each query the score is the fraction of its golden
    `(doc_uuid, chunk_index)` pairs that appear among the chunks returned by
    `search_func(query, topk)`; the reported Pass@k is the mean of those scores.

    Args:
        topk: Number of results requested from `search_func`.
        search_func: Callable taking `(query, topk)` and returning chunk rows
            that expose `doc_uuid` and `index`.
    """
    print(f"TopK={topk}, search by: {search_func.__name__}")
    queries: list[Query] = vr.select_by(Query.partial_init())
    if not queries:
        # Nothing to average over; avoid dividing by zero below.
        print("No queries found, skip evaluation.")
        return

    total_score = 0.0
    start = perf_counter()
    for query in queries:
        # Set membership replaces the nested scan over the returned chunks.
        retrieved = {
            (chunk.doc_uuid, chunk.index) for chunk in search_func(query, topk)
        }
        golden = list(zip(query.doc_uuids, query.chunk_index, strict=True))
        if not golden:
            # A query without golden chunks contributes a score of 0.
            continue
        hits = sum(1 for pair in golden if pair in retrieved)
        total_score += hits / len(golden)
    elapsed = perf_counter() - start

    print(
        f"Pass@{topk}: {total_score / len(queries):.4f}, total queries: {len(queries)}, QPS: {len(queries) / elapsed:.3f}"
    )


if __name__ == "__main__":
    # Fetch the datasets (skipped when already present), load them, then
    # evaluate every search strategy at each top-k.
    Path("datasets").mkdir(parents=True, exist_ok=True)
    downloads = (
        (
            "https://raw.githubusercontent.com/anthropics/anthropic-cookbook/refs/heads/main/skills/contextual-embeddings/data/codebase_chunks.json",
            "datasets/codebase_chunks.json",
        ),
        (
            "https://raw.githubusercontent.com/anthropics/anthropic-cookbook/refs/heads/main/skills/contextual-embeddings/data/evaluation_set.jsonl",
            "datasets/evaluation_set.jsonl",
        ),
    )
    for source_url, local_path in downloads:
        download_data(source_url, local_path)

    load_data("datasets/codebase_chunks.json")
    load_query("datasets/evaluation_set.jsonl")
    load_contextual_chunks("datasets/codebase_chunks.json")

    # Plain-chunk strategies first, then their contextual counterparts.
    strategies = (
        vector_search,
        keyword_search,
        hybrid_search_fuse,
        hybrid_search_rerank,
        vector_contextual_search,
        keyword_contextual_search,
        hybrid_contextual_search_fuse,
        hybrid_contextual_search_rerank,
    )
    for topk in [5, 10]:
        print("=" * 50)
        for strategy in strategies:
            evaluate(topk=topk, search_func=strategy)
23 changes: 23 additions & 0 deletions tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import msgspec
import numpy as np
import pytest
from psycopg.errors import UniqueViolation
from psycopg.types.json import Jsonb

from vechord.log import logger
Expand All @@ -17,6 +18,7 @@
PrimaryKeyAutoIncrease,
PrimaryKeyUUID,
Table,
UniqueIndex,
Vector,
VectorIndex,
)
Expand Down Expand Up @@ -113,6 +115,27 @@ def test_annotated_index(registry):
assert len(res) == topk


@pytest.mark.db
def test_unique_index(registry):
    """A unique-indexed column rejects duplicates and can be a foreign-key target.

    The test also expects that removing the parent row leaves the child table
    empty.
    """

    class UniqueTable(Table, kw_only=True):
        uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
        sid: Annotated[str, UniqueIndex()]

    class SubTable(Table, kw_only=True):
        uid: PrimaryKeyUUID = msgspec.field(default_factory=PrimaryKeyUUID.factory)
        text: str
        foreign_key: Annotated[str, ForeignKey[UniqueTable.sid]]

    shared_sid = "id_0"
    registry.register([UniqueTable, SubTable])

    # First insert succeeds; a second row with the same sid must be rejected.
    registry.insert(UniqueTable(sid=shared_sid))
    with pytest.raises(UniqueViolation):
        registry.insert(UniqueTable(sid=shared_sid))

    # A child row may reference the unique column.
    registry.insert(SubTable(text="hello", foreign_key=shared_sid))

    registry.remove_by(UniqueTable.partial_init(sid=shared_sid))
    parents_left = registry.select_by(UniqueTable.partial_init())
    assert len(parents_left) == 0
    children_left = registry.select_by(SubTable.partial_init())
    assert len(children_left) == 0


@pytest.mark.db
def test_keyword_tokenizer(registry):
Tockenizer = Keyword.with_model("wiki_tocken")
Expand Down
11 changes: 5 additions & 6 deletions vechord/augment.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,19 +78,18 @@ def augment(self, chunks: list[str], prompt: str) -> list[str]:
for chunk in chunks:
context = prompt.format(chunk=chunk)
if self.doc:
context = f"<document>{self.doc}</document>\n" + context
context = f"<document>\n{self.doc}\n</document>\n" + context
response = self.client.generate_content([context])
res.append(response.text)
except Exception as e:
logger.error("GeminiAugmenter error: %s", e)
breakpoint()
return res

def augment_context(self, chunks: list[str]) -> list[str]:
"""Generate the contextual chunks."""
prompt = (
"Here is the chunk we want to situate within the whole document "
"<chunk>{chunk}</chunk>"
"Here is the chunk we want to situate within the whole document \n"
"<chunk>\n{chunk}\n</chunk>\n"
"Please give a short succinct context to situate this chunk within "
"the overall document for the purposes of improving search retrieval "
"of the chunk. Answer only with the succinct context and nothing else."
Expand All @@ -100,8 +99,8 @@ def augment_context(self, chunks: list[str]) -> list[str]:
def augment_query(self, chunks: list[str]) -> list[str]:
"""Generate the queries for chunks."""
prompt = (
"Here is the chunk we want to ask questions about "
"<chunk>{chunk}</chunk>"
"Here is the chunk we want to ask questions about \n"
"<chunk>\n{chunk}\n</chunk>\n"
"Please ask questions about this chunk based on the overall document "
"for the purposes of improving search retrieval of the chunk. "
"Answer only with the question and nothing else."
Expand Down
Loading