diff --git a/vechord/embedding.py b/vechord/embedding.py
index 0abd2cd..87a6c50 100644
--- a/vechord/embedding.py
+++ b/vechord/embedding.py
@@ -1,4 +1,3 @@
-import base64
 import os
 from abc import ABC, abstractmethod
 from enum import Enum, auto
@@ -12,14 +11,16 @@
 from vechord.model import (
     GeminiEmbeddingRequest,
     JinaEmbeddingRequest,
-    MultiModalInput,
     SparseEmbedding,
     VoyageEmbeddingRequest,
-    VoyageEmbeddingResponse,
     VoyageMultiModalEmbeddingRequest,
 )
-from vechord.provider import GeminiEmbeddingProvider, JinaEmbeddingProvider
-from vechord.utils import VOYAGE_EMBEDDING_RPS, RateLimitTransport
+from vechord.model.voyage import VOYAGE_INPUT_TYPE
+from vechord.provider import (
+    GeminiEmbeddingProvider,
+    JinaEmbeddingProvider,
+    VoyageEmbeddingProvider,
+)
 
 
 class VecType(Enum):
@@ -137,42 +138,22 @@ def name(self) -> str:
 
     async def vectorize_chunk(self, text: str) -> np.ndarray:
         resp = await self.query(
-            JinaEmbeddingRequest.from_text(text, "retrieval.passage")
+            JinaEmbeddingRequest.from_text(text, "retrieval.passage", self.model)
         )
         return resp.get_emb()
 
     async def vectorize_query(self, text: str) -> np.ndarray:
-        resp = await self.query(JinaEmbeddingRequest.from_text(text, "retrieval.query"))
+        resp = await self.query(
+            JinaEmbeddingRequest.from_text(text, "retrieval.query", self.model)
+        )
         return resp.get_emb()
 
 
-class VoyageDenseEmbedding(BaseEmbedding):
+class VoyageDenseEmbedding(BaseEmbedding, VoyageEmbeddingProvider):
     def __init__(
         self, model: str = "voyage-3.5", dim: Literal[256, 512, 1024, 2048] = 1024
     ):
-        self.api_key = os.environ.get("VOYAGE_API_KEY")
-        if not self.api_key:
-            raise ValueError("env VOYAGE_API_KEY not set")
-
-        self.model = model
-        self.dim = dim
-        self.url = "https://api.voyageai.com/v1/embeddings"
-        self.client = httpx.AsyncClient(
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.api_key}",
-            },
-            timeout=httpx.Timeout(30.0, connect=10.0),
-            transport=RateLimitTransport(max_per_second=VOYAGE_EMBEDDING_RPS),
-        )
-        self.decoder = msgspec.json.Decoder(type=VoyageEmbeddingResponse)
-        self.encoder = msgspec.json.Encoder()
-
-    async def __aenter__(self):
-        return self
-
-    async def __aexit__(self, _exc_type, _exc_value, _traceback):
-        await self.client.aclose()
+        super().__init__(model, dim)
 
     def name(self):
         return f"voyage_emb_{self.model}_{self.dim}"
@@ -184,26 +165,15 @@ def vec_type(self) -> VecType:
         return VecType.DENSE
 
     async def vectorize(
-        self, text: str, input_type: Literal["document", "query"] = "document"
+        self, text: str, input_type: VOYAGE_INPUT_TYPE = "document"
     ) -> np.ndarray:
-        resp = await self.client.post(
-            url=self.url,
-            content=self.encoder.encode(
-                VoyageEmbeddingRequest(
-                    model=self.model,
-                    input_text=text,
-                    input_type=input_type,
-                    output_dimension=self.dim,
-                )
-            ),
-        )
-        if resp.is_error:
-            raise RuntimeError(
-                f"failed to call Voyage emb: [{resp.status_code}] {resp.content}"
-            )
-        body = self.decoder.decode(resp.content)
-        emb = np.frombuffer(body.data[0].embedding, dtype=np.float32)
-        return emb
+        """Embed text; subclasses override this to reroute chunk/query calls."""
+        resp = await self.query(
+            VoyageEmbeddingRequest.from_text(
+                text=text, input_type=input_type, model=self.model, dim=self.dim
+            )
+        )
+        return resp.get_emb()
 
     async def vectorize_chunk(self, text):
         return await self.vectorize(text, "document")
@@ -228,7 +198,7 @@ def __init__(self, model="voyage-multimodal-3", dim=1024):
     def name(self):
         return f"voyage_multimodal_emb_{self.model}_{self.dim}"
 
-    async def vectorize(self, text, input_type: Literal["document", "query"] = "query"):
+    async def vectorize(self, text, input_type: VOYAGE_INPUT_TYPE = "query"):
         return await self.vectorize_multimodal(text=text, input_type=input_type)
 
     async def vectorize_multimodal(
@@ -236,43 +206,23 @@ async def vectorize_multimodal(
         image: Optional[bytes] = None,
         text: Optional[str] = None,
         image_url: Optional[str] = None,
-        input_type: Literal["query", "document"] = "document",
+        input_type: VOYAGE_INPUT_TYPE = "document",
     ):
         if not (image or text or image_url):
             raise ValueError(
                 "At least one of image, text, or image_url must be provided"
             )
 
-        input_content = []
-        if text:
-            input_content.append({"type": "text", "text": text})
-        if image:
-            input_content.append(
-                {
-                    "type": "image_base64",
-                    "image_base64": f"data:image/jpeg;base64,{base64.b64encode(image).decode('utf-8')}",
-                }
+        resp = await self.query(
+            VoyageMultiModalEmbeddingRequest.build(
+                text=text,
+                image=image,
+                image_url=image_url,
+                input_type=input_type,
+                model=self.model,
             )
-        if image_url:
-            input_content.append({"type": "image_url", "image_url": image_url})
-
-        resp = await self.client.post(
-            url=self.url,
-            content=self.encoder.encode(
-                VoyageMultiModalEmbeddingRequest(
-                    model=self.model,
-                    inputs=[MultiModalInput(content=input_content)],
-                    input_type=input_type,
-                )
-            ),
         )
-        if resp.is_error:
-            raise RuntimeError(
-                f"failed to call Voyage multimodal emb: [{resp.status_code}] {resp.content}"
-            )
-        body = self.decoder.decode(resp.content)
-        emb = np.frombuffer(body.data[0].embedding, dtype=np.float32)
-        return emb
+        return resp.get_emb()
 
     async def vectorize_multimodal_chunk(
         self,
diff --git a/vechord/model/__init__.py b/vechord/model/__init__.py
index ec08761..6604e90 100644
--- a/vechord/model/__init__.py
+++ b/vechord/model/__init__.py
@@ -14,7 +14,6 @@
 )
 from vechord.model.jina import JinaEmbeddingRequest, JinaEmbeddingResponse
 from vechord.model.voyage import (
-    MultiModalInput,
     VoyageEmbeddingRequest,
     VoyageEmbeddingResponse,
     VoyageMultiModalEmbeddingRequest,
@@ -32,7 +31,6 @@
     "InputType",
     "JinaEmbeddingRequest",
     "JinaEmbeddingResponse",
-    "MultiModalInput",
     "Relation",
     "ResourceRequest",
     "RetrievedChunk",
diff --git a/vechord/model/jina.py b/vechord/model/jina.py
index 304b974..764d404 100644
--- a/vechord/model/jina.py
+++ b/vechord/model/jina.py
@@ -31,9 +31,9 @@ class JinaEmbeddingRequest(msgspec.Struct, kw_only=True, omit_defaults=True):
     input_content: list[JinaInput] = msgspec.field(name="input")
 
     @classmethod
-    def from_text(cls, text: str, task: JinaEmbeddingType) -> Self:
+    def from_text(cls, text: str, task: JinaEmbeddingType, model: str) -> Self:
         return JinaEmbeddingRequest(
-            model="jina-embeddings-v4",
+            model=model,
             truncate=True,
             task=task,
             embedding_type="base64",
diff --git a/vechord/model/voyage.py b/vechord/model/voyage.py
index 993aad2..4d6cd6d 100644
--- a/vechord/model/voyage.py
+++ b/vechord/model/voyage.py
@@ -1,7 +1,12 @@
+import base64
 import re
 from typing import Literal, Optional
 
 import msgspec
+import numpy as np
+
+from vechord.errors import UnexpectedResponseError
+from vechord.typing import Self
 
 
 def pascal_to_snake(s: str) -> str:
@@ -18,16 +23,40 @@ class VoyageEmbedding(msgspec.Struct, kw_only=True):
 class VoyageEmbeddingResponse(msgspec.Struct, kw_only=True):
     data: list[VoyageEmbedding]
 
+    def get_emb(self) -> np.ndarray:
+        """Get the first embedding as a numpy array."""
+        if not self.data or not self.data[0].embedding:
+            raise UnexpectedResponseError("empty embedding data")
+        emb = self.data[0].embedding
+        if isinstance(emb, list):
+            return np.array(emb, dtype=np.float32)
+        return np.frombuffer(emb, dtype=np.float32)
+
+
+VOYAGE_INPUT_TYPE = Literal["query", "document"]
+
 
 class VoyageEmbeddingRequest(msgspec.Struct, kw_only=True):
     model: str
     input_text: str | list[str] = msgspec.field(name="input")
-    input_type: Literal["query", "document"] = "document"
+    input_type: VOYAGE_INPUT_TYPE = "document"
     truncation: bool = True
     output_dimension: int
     output_dtype: Literal["float", "int8", "uint8", "binary", "ubinary"] = "float"
     encoding_format: Optional[Literal["base64"]] = "base64"
 
+    @classmethod
+    def from_text(
+        cls, text: str, input_type: VOYAGE_INPUT_TYPE, model: str, dim: int
+    ) -> Self:
+        """Build a single-text embedding request."""
+        return cls(
+            model=model,
+            input_text=text,
+            input_type=input_type,
+            output_dimension=dim,
+        )
+
 
 class Text(msgspec.Struct, tag=pascal_to_snake):
     text: str
@@ -48,6 +77,34 @@ class MultiModalInput(msgspec.Struct, tag=pascal_to_snake):
 class VoyageMultiModalEmbeddingRequest(msgspec.Struct, kw_only=True):
     model: str
     inputs: list[MultiModalInput]
-    input_type: Literal["query", "document"] = "document"
+    input_type: VOYAGE_INPUT_TYPE = "document"
     truncation: bool = True
     encoding_format: Optional[Literal["base64"]] = "base64"
+
+    @classmethod
+    def build(
+        cls,
+        text: Optional[str],
+        image_url: Optional[str],
+        image: Optional[bytes],
+        model: str,
+        input_type: VOYAGE_INPUT_TYPE,
+    ) -> Self:
+        """Build a request from any mix of text, image bytes and image URL."""
+        contents = []
+        # Same part order (text, image, image_url) as before the refactor.
+        if text:
+            contents.append(Text(text=text))
+        if image:
+            contents.append(
+                ImageBase64(
+                    image_base64=f"data:image/jpeg;base64,{base64.b64encode(image).decode('utf-8')}",
+                )
+            )
+        if image_url:
+            contents.append(ImageURL(image_url=image_url))
+        return cls(
+            model=model,
+            inputs=[MultiModalInput(content=contents)],
+            input_type=input_type,
+        )
diff --git a/vechord/provider.py b/vechord/provider.py
index 105df3d..bb8fa23 100644
--- a/vechord/provider.py
+++ b/vechord/provider.py
@@ -12,11 +12,15 @@
     GeminiGenerateResponse,
     JinaEmbeddingRequest,
     JinaEmbeddingResponse,
+    VoyageEmbeddingRequest,
+    VoyageEmbeddingResponse,
+    VoyageMultiModalEmbeddingRequest,
 )
 from vechord.utils import (
     GEMINI_EMBEDDING_RPS,
     GEMINI_GENERATE_RPS,
     JINA_EMBEDDING_RPS,
+    VOYAGE_EMBEDDING_RPS,
     RateLimitTransport,
 )
 
@@ -135,3 +139,35 @@ async def query(self, req: JinaEmbeddingRequest) -> JinaEmbeddingResponse:
                 "Failed to query Jina embedding", response.status_code, response.text
             )
         return self.decoder.decode(response.content)
+
+
+class VoyageEmbeddingProvider(BaseProvider):
+    """Voyage Embedding Provider."""
+
+    PROVIDER_NAME = "VOYAGE"
+
+    def __init__(self, model: str = "voyage-3.5", dim: int = 1024):
+        super().__init__(model)
+        self.dim = dim
+        self.client = httpx.AsyncClient(
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.api_key}",
+            },
+            timeout=httpx.Timeout(60.0, connect=10.0),
+            transport=RateLimitTransport(max_per_second=VOYAGE_EMBEDDING_RPS),
+        )
+        self.url = "https://api.voyageai.com/v1/embeddings"
+        self.encoder = msgspec.json.Encoder()
+        self.decoder = msgspec.json.Decoder(VoyageEmbeddingResponse)
+
+    async def query(
+        self, req: VoyageEmbeddingRequest | VoyageMultiModalEmbeddingRequest
+    ) -> VoyageEmbeddingResponse:
+        """Query the Voyage embedding model with a request."""
+        response = await self.client.post(self.url, content=self.encoder.encode(req))
+        if response.is_error:
+            raise HTTPCallError(
+                "Failed to query Voyage embedding", response.status_code, response.text
+            )
+        return self.decoder.decode(response.content)