From 2506f4f8d27bb6e3f9a942d8a0e0c3f33ee73240 Mon Sep 17 00:00:00 2001 From: ryzhtus Date: Thu, 3 Jul 2025 17:44:07 +0200 Subject: [PATCH 01/20] Added backend class for SparseEncoder and also SentenceTransformersSparseTextEmbedder --- .../backends/sentence_transformers_backend.py | 76 +++- ...tence_transformers_sparse_text_embedder.py | 215 ++++++++++++ pyproject.toml | 2 +- ...tence_transformers_sparse_text_embedder.py | 327 ++++++++++++++++++ 4 files changed, 607 insertions(+), 13 deletions(-) create mode 100644 haystack/components/embedders/sentence_transformers_sparse_text_embedder.py create mode 100644 test/components/embedders/test_sentence_transformers_sparse_text_embedder.py diff --git a/haystack/components/embedders/backends/sentence_transformers_backend.py b/haystack/components/embedders/backends/sentence_transformers_backend.py index 96b000bbfa..e337bc00e1 100644 --- a/haystack/components/embedders/backends/sentence_transformers_backend.py +++ b/haystack/components/embedders/backends/sentence_transformers_backend.py @@ -7,8 +7,8 @@ from haystack.lazy_imports import LazyImport from haystack.utils.auth import Secret -with LazyImport(message="Run 'pip install \"sentence-transformers>=4.1.0\"'") as sentence_transformers_import: - from sentence_transformers import SentenceTransformer +with LazyImport(message="Run 'pip install \"sentence-transformers>=5.0.0\"'") as sentence_transformers_import: + from sentence_transformers import SentenceTransformer, SparseEncoder class _SentenceTransformersEmbeddingBackendFactory: @@ -30,15 +30,67 @@ def get_embedding_backend( # pylint: disable=too-many-positional-arguments tokenizer_kwargs: Optional[Dict[str, Any]] = None, config_kwargs: Optional[Dict[str, Any]] = None, backend: Literal["torch", "onnx", "openvino"] = "torch", + sparse: bool = False, ): embedding_backend_id = f"{model}{device}{auth_token}{truncate_dim}{backend}" if embedding_backend_id in _SentenceTransformersEmbeddingBackendFactory._instances: return _SentenceTransformersEmbeddingBackendFactory._instances[embedding_backend_id] - embedding_backend = _SentenceTransformersEmbeddingBackend( - model=model, + + if sparse: + embedding_backend = _SentenceTransformersSparseEncoderEmbeddingBackend( + model=model, + device=device, + auth_token=auth_token, + trust_remote_code=trust_remote_code, + local_files_only=local_files_only, + model_kwargs=model_kwargs, + tokenizer_kwargs=tokenizer_kwargs, + config_kwargs=config_kwargs, + backend=backend, + ) + else: + embedding_backend = _SentenceTransformersEmbeddingBackend( + model=model, + device=device, + auth_token=auth_token, + trust_remote_code=trust_remote_code, + local_files_only=local_files_only, + truncate_dim=truncate_dim, + model_kwargs=model_kwargs, + tokenizer_kwargs=tokenizer_kwargs, + config_kwargs=config_kwargs, + backend=backend, + ) + + _SentenceTransformersEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend + return embedding_backend + + +class _SentenceTransformersEmbeddingBackend: + """ + Class to manage Sentence Transformers embeddings. 
+ """ + + def __init__( # pylint: disable=too-many-positional-arguments + self, + model: str, + device: Optional[str] = None, + auth_token: Optional[Secret] = None, + trust_remote_code: bool = False, + local_files_only: bool = False, + truncate_dim: Optional[int] = None, + model_kwargs: Optional[Dict[str, Any]] = None, + tokenizer_kwargs: Optional[Dict[str, Any]] = None, + config_kwargs: Optional[Dict[str, Any]] = None, + backend: Literal["torch", "onnx", "openvino"] = "torch", + ): + sentence_transformers_import.check() + + self.model = SentenceTransformer( + model_name_or_path=model, device=device, - auth_token=auth_token, + token=auth_token.resolve_value() if auth_token else None, trust_remote_code=trust_remote_code, local_files_only=local_files_only, truncate_dim=truncate_dim, @@ -47,13 +99,15 @@ def get_embedding_backend( # pylint: disable=too-many-positional-arguments config_kwargs=config_kwargs, backend=backend, ) - _SentenceTransformersEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend - return embedding_backend + def embed(self, data: List[str], **kwargs) -> List[List[float]]: + embeddings = self.model.encode(data, **kwargs).tolist() + return embeddings -class _SentenceTransformersEmbeddingBackend: + +class _SentenceTransformersSparseEncoderEmbeddingBackend: """ - Class to manage Sentence Transformers embeddings. + Class to manage Sparse embeddings from Sentence Transformers. """ def __init__( # pylint: disable=too-many-positional-arguments @@ -63,7 +117,6 @@ def __init__( # pylint: disable=too-many-positional-arguments auth_token: Optional[Secret] = None, trust_remote_code: bool = False, local_files_only: bool = False, - truncate_dim: Optional[int] = None, model_kwargs: Optional[Dict[str, Any]] = None, tokenizer_kwargs: Optional[Dict[str, Any]] = None, config_kwargs: Optional[Dict[str, Any]] = None, @@ -71,13 +124,12 @@ def __init__( # pylint: disable=too-many-positional-arguments ): sentence_transformers_import.check() - self.model = SentenceTransformer( + self.model = SparseEncoder( model_name_or_path=model, device=device, token=auth_token.resolve_value() if auth_token else None, trust_remote_code=trust_remote_code, local_files_only=local_files_only, - truncate_dim=truncate_dim, model_kwargs=model_kwargs, tokenizer_kwargs=tokenizer_kwargs, config_kwargs=config_kwargs, diff --git a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py new file mode 100644 index 0000000000..9247d75e8c --- /dev/null +++ b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py @@ -0,0 +1,215 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Dict, List, Literal, Optional + +from haystack import component, default_from_dict, default_to_dict +from haystack.components.embedders.backends.sentence_transformers_backend import ( + _SentenceTransformersEmbeddingBackendFactory, + _SentenceTransformersSparseEncoderEmbeddingBackend, +) +from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace +from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs + + +@component +class SentenceTransformersSparseTextEmbedder: + """ + Embeds strings using sprase embedding models from Sentence Transformers. + + You can use it to embed user query and send it to an embedding retriever. 
+ + Usage example: + ```python + from haystack.components.embedders import SentenceTransformersSparseTextEmbedder + + text_to_embed = "I love pizza!" + + text_embedder = SentenceTransformersSparseTextEmbedder() + text_embedder.warm_up() + + print(text_embedder.run(text_to_embed)) + + # {'embedding': [-0.07804739475250244, 0.1498992145061493,, ...]} + ``` + """ + + def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments + self, + model: str = "naver/splade-cocondenser-ensembledistil", + device: Optional[ComponentDevice] = None, + token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False), + prefix: str = "", + suffix: str = "", + batch_size: int = 32, + progress_bar: bool = True, + trust_remote_code: bool = False, + local_files_only: bool = False, + model_kwargs: Optional[Dict[str, Any]] = None, + tokenizer_kwargs: Optional[Dict[str, Any]] = None, + config_kwargs: Optional[Dict[str, Any]] = None, + encode_kwargs: Optional[Dict[str, Any]] = None, + backend: Literal["torch", "onnx", "openvino"] = "torch", + ): + """ + Create a SentenceTransformersSparseTextEmbedder component. + + :param model: + The model to use for calculating embeddings. + Specify the path to a local model or the ID of the model on Hugging Face. + :param device: + Overrides the default device used to load the model. + :param token: + An API token to use private models from Hugging Face. + :param prefix: + A string to add at the beginning of each text to be embedded. + You can use it to prepend the text with an instruction, as required by some embedding models, + such as E5 and bge. + :param suffix: + A string to add at the end of each text to embed. + :param batch_size: + Number of texts to embed at once. + :param progress_bar: + If `True`, shows a progress bar for calculating embeddings. + If `False`, disables the progress bar. + :param trust_remote_code: + If `False`, permits only Hugging Face verified model architectures. + If `True`, permits custom models and scripts. + :param local_files_only: + If `True`, does not attempt to download the model from Hugging Face Hub and only looks at local files. + :param model_kwargs: + Additional keyword arguments for `AutoModelForSequenceClassification.from_pretrained` + when loading the model. Refer to specific model documentation for available kwargs. + :param tokenizer_kwargs: + Additional keyword arguments for `AutoTokenizer.from_pretrained` when loading the tokenizer. + Refer to specific model documentation for available kwargs. + :param config_kwargs: + Additional keyword arguments for `AutoConfig.from_pretrained` when loading the model configuration. + :param encode_kwargs: + Additional keyword arguments for `SentenceTransformer.encode` when embedding texts. + This parameter is provided for fine customization. Be careful not to clash with already set parameters and + avoid passing parameters that change the output type. + :param backend: + The backend to use for the Sentence Transformers model. Choose from "torch", "onnx", or "openvino". + Refer to the [Sentence Transformers documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html) + for more information on acceleration and quantization options. 
+ """ + + self.model = model + self.device = ComponentDevice.resolve_device(device) + self.token = token + self.prefix = prefix + self.suffix = suffix + self.batch_size = batch_size + self.progress_bar = progress_bar + self.trust_remote_code = trust_remote_code + self.local_files_only = local_files_only + self.model_kwargs = model_kwargs + self.tokenizer_kwargs = tokenizer_kwargs + self.config_kwargs = config_kwargs + self.encode_kwargs = encode_kwargs + self.embedding_backend: Optional[_SentenceTransformersSparseEncoderEmbeddingBackend] = None + self.backend = backend + + def _get_telemetry_data(self) -> Dict[str, Any]: + """ + Data that is sent to Posthog for usage analytics. + """ + return {"model": self.model} + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + serialization_dict = default_to_dict( + self, + model=self.model, + device=self.device.to_dict(), + token=self.token.to_dict() if self.token else None, + prefix=self.prefix, + suffix=self.suffix, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + trust_remote_code=self.trust_remote_code, + local_files_only=self.local_files_only, + model_kwargs=self.model_kwargs, + tokenizer_kwargs=self.tokenizer_kwargs, + config_kwargs=self.config_kwargs, + encode_kwargs=self.encode_kwargs, + backend=self.backend, + ) + if serialization_dict["init_parameters"].get("model_kwargs") is not None: + serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"]) + return serialization_dict + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "SentenceTransformersSparseTextEmbedder": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + init_params = data["init_parameters"] + if init_params.get("device") is not None: + init_params["device"] = ComponentDevice.from_dict(init_params["device"]) + deserialize_secrets_inplace(init_params, keys=["token"]) + if init_params.get("model_kwargs") is not None: + deserialize_hf_model_kwargs(init_params["model_kwargs"]) + return default_from_dict(cls, data) + + def warm_up(self): + """ + Initializes the component. + """ + if self.embedding_backend is None: + self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend( + model=self.model, + device=self.device.to_torch_str(), + auth_token=self.token, + trust_remote_code=self.trust_remote_code, + local_files_only=self.local_files_only, + model_kwargs=self.model_kwargs, + tokenizer_kwargs=self.tokenizer_kwargs, + config_kwargs=self.config_kwargs, + backend=self.backend, + sparse=True, + ) + if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"): + self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"] + + @component.output_types(embedding=List[float]) + def run(self, text: str): + """ + Embed a single string. + + :param text: + Text to embed. + + :returns: + A dictionary with the following keys: + - `embedding`: The embedding of the input text. + """ + if not isinstance(text, str): + raise TypeError( + "SentenceTransformersSparseTextEmbedder expects a string as input." + "In case you want to embed a list of Documents, please use the" + "SentenceTransformersSparseDocumentEmbedder." + ) + if self.embedding_backend is None: + raise RuntimeError("The embedding model has not been loaded. 
Please call warm_up() before running.") + + text_to_embed = self.prefix + text + self.suffix + embedding = self.embedding_backend.embed( + [text_to_embed], + batch_size=self.batch_size, + show_progress_bar=self.progress_bar, + **(self.encode_kwargs if self.encode_kwargs else {}), + )[0] + return {"embedding": embedding} diff --git a/pyproject.toml b/pyproject.toml index 05d12a2a38..9527df5ba3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ dependencies = [ "transformers[torch, sentencepiece]>=4.52.4,<4.53", # ExtractiveReader, TransformersSimilarityRanker, LocalWhisperTranscriber, HFGenerators... "huggingface_hub>=0.27.0", # Hugging Face API Generators and Embedders - "sentence-transformers>=4.1.0", # Sentence Transformers Embedders, Rankers, and SASEvaluator + "sentence-transformers>=5.0.0", # Sentence Transformers Embedders, Rankers, and SASEvaluator "langdetect", # TextLanguageRouter and DocumentLanguageClassifier "openai-whisper>=20231106", # LocalWhisperTranscriber "arrow>=1.3.0", # Jinja2TimeExtension diff --git a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py new file mode 100644 index 0000000000..dc5ca4c770 --- /dev/null +++ b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py @@ -0,0 +1,327 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import random +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from haystack.components.embedders.sentence_transformers_sparse_text_embedder import ( + SentenceTransformersSparseTextEmbedder, +) +from haystack.utils import ComponentDevice, Secret + + +class TestSentenceTransformersSparseTextEmbedder: + def test_init_default(self): + embedder = SentenceTransformersSparseTextEmbedder(model="model") + assert embedder.model == "model" + assert embedder.device == ComponentDevice.resolve_device(None) + assert embedder.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False) + assert embedder.prefix == "" + assert embedder.suffix == "" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.trust_remote_code is False + assert embedder.local_files_only is False + + def test_init_with_parameters(self): + embedder = SentenceTransformersSparseTextEmbedder( + model="model", + device=ComponentDevice.from_str("cuda:0"), + token=Secret.from_token("fake-api-token"), + prefix="prefix", + suffix="suffix", + batch_size=64, + progress_bar=False, + trust_remote_code=True, + local_files_only=True, + ) + assert embedder.model == "model" + assert embedder.device == ComponentDevice.from_str("cuda:0") + assert embedder.token == Secret.from_token("fake-api-token") + assert embedder.prefix == "prefix" + assert embedder.suffix == "suffix" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.trust_remote_code is True + assert embedder.local_files_only is True + + def test_to_dict(self): + component = SentenceTransformersSparseTextEmbedder(model="model", device=ComponentDevice.from_str("cpu")) + data = component.to_dict() + assert data == { + "type": "haystack.components.embedders.sentence_transformers_sparse_text_embedder.SentenceTransformersSparseTextEmbedder", + "init_parameters": { + "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"}, + "model": "model", + "device": ComponentDevice.from_str("cpu").to_dict(), + 
"prefix": "", + "suffix": "", + "batch_size": 32, + "progress_bar": True, + "trust_remote_code": False, + "local_files_only": False, + "model_kwargs": None, + "tokenizer_kwargs": None, + "encode_kwargs": None, + "config_kwargs": None, + "backend": "torch", + }, + } + + def test_to_dict_with_custom_init_parameters(self): + component = SentenceTransformersSparseTextEmbedder( + model="model", + device=ComponentDevice.from_str("cuda:0"), + token=Secret.from_env_var("ENV_VAR", strict=False), + prefix="prefix", + suffix="suffix", + batch_size=64, + progress_bar=False, + trust_remote_code=True, + local_files_only=True, + model_kwargs={"torch_dtype": torch.float32}, + tokenizer_kwargs={"model_max_length": 512}, + config_kwargs={"use_memory_efficient_attention": False}, + encode_kwargs={"task": "clustering"}, + ) + data = component.to_dict() + assert data == { + "type": "haystack.components.embedders.sentence_transformers_sparse_text_embedder.SentenceTransformersSparseTextEmbedder", + "init_parameters": { + "token": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"}, + "model": "model", + "device": ComponentDevice.from_str("cuda:0").to_dict(), + "prefix": "prefix", + "suffix": "suffix", + "batch_size": 64, + "progress_bar": False, + "trust_remote_code": True, + "local_files_only": True, + "model_kwargs": {"torch_dtype": "torch.float32"}, + "tokenizer_kwargs": {"model_max_length": 512}, + "config_kwargs": {"use_memory_efficient_attention": False}, + "encode_kwargs": {"task": "clustering"}, + "backend": "torch", + }, + } + + def test_to_dict_not_serialize_token(self): + component = SentenceTransformersSparseTextEmbedder(model="model", token=Secret.from_token("fake-api-token")) + with pytest.raises(ValueError, match="Cannot serialize token-based secret"): + component.to_dict() + + def test_from_dict(self): + data = { + "type": "haystack.components.embedders.sentence_transformers_sparse_text_embedder.SentenceTransformersSparseTextEmbedder", + "init_parameters": { + "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"}, + "model": "model", + "device": ComponentDevice.from_str("cpu").to_dict(), + "prefix": "", + "suffix": "", + "batch_size": 32, + "progress_bar": True, + "trust_remote_code": False, + "local_files_only": False, + "model_kwargs": {"torch_dtype": "torch.float32"}, + "tokenizer_kwargs": {"model_max_length": 512}, + "config_kwargs": {"use_memory_efficient_attention": False}, + }, + } + component = SentenceTransformersSparseTextEmbedder.from_dict(data) + assert component.model == "model" + assert component.device == ComponentDevice.from_str("cpu") + assert component.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False) + assert component.prefix == "" + assert component.suffix == "" + assert component.batch_size == 32 + assert component.progress_bar is True + assert component.trust_remote_code is False + assert component.local_files_only is False + assert component.model_kwargs == {"torch_dtype": torch.float32} + assert component.tokenizer_kwargs == {"model_max_length": 512} + assert component.config_kwargs == {"use_memory_efficient_attention": False} + + def test_from_dict_no_default_parameters(self): + data = { + "type": "haystack.components.embedders.sentence_transformers_sparse_text_embedder.SentenceTransformersSparseTextEmbedder", + "init_parameters": {}, + } + component = SentenceTransformersSparseTextEmbedder.from_dict(data) + assert component.model == "naver/splade-cocondenser-ensembledistil" + assert component.device == 
ComponentDevice.resolve_device(None) + assert component.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False) + assert component.prefix == "" + assert component.suffix == "" + assert component.batch_size == 32 + assert component.progress_bar is True + assert component.trust_remote_code is False + assert component.local_files_only is False + + def test_from_dict_none_device(self): + data = { + "type": "haystack.components.embedders.sentence_transformers_sparse_text_embedder.SentenceTransformersSparseTextEmbedder", + "init_parameters": { + "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"}, + "model": "model", + "device": None, + "prefix": "", + "suffix": "", + "batch_size": 32, + "progress_bar": True, + "trust_remote_code": False, + "local_files_only": False, + }, + } + component = SentenceTransformersSparseTextEmbedder.from_dict(data) + assert component.model == "model" + assert component.device == ComponentDevice.resolve_device(None) + assert component.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False) + assert component.prefix == "" + assert component.suffix == "" + assert component.batch_size == 32 + assert component.progress_bar is True + assert component.trust_remote_code is False + assert component.local_files_only is False + + @patch( + "haystack.components.embedders.sentence_transformers_text_embedder._SentenceTransformersEmbeddingBackendFactory" + ) + def test_warmup(self, mocked_factory): + embedder = SentenceTransformersSparseTextEmbedder( + model="model", + token=None, + device=ComponentDevice.from_str("cpu"), + tokenizer_kwargs={"model_max_length": 512}, + ) + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + embedder.embedding_backend.model.max_seq_length = 512 + mocked_factory.get_embedding_backend.assert_called_once_with( + model="model", + device="cpu", + auth_token=None, + trust_remote_code=False, + local_files_only=False, + model_kwargs=None, + tokenizer_kwargs={"model_max_length": 512}, + config_kwargs=None, + backend="torch", + ) + + @patch( + "haystack.components.embedders.sentence_transformers_text_embedder._SentenceTransformersEmbeddingBackendFactory" + ) + def test_warmup_doesnt_reload(self, mocked_factory): + embedder = SentenceTransformersSparseTextEmbedder(model="model") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once() + + def test_run(self): + embedder = SentenceTransformersSparseTextEmbedder(model="model") + embedder.embedding_backend = MagicMock() + embedder.embedding_backend.embed = lambda x, **kwargs: [ + [random.random() for _ in range(16)] for _ in range(len(x)) + ] + + text = "a nice text to embed" + + result = embedder.run(text=text) + embedding = result["embedding"] + + assert isinstance(embedding, list) + assert all(isinstance(el, float) for el in embedding) + + def test_run_wrong_input_format(self): + embedder = SentenceTransformersSparseTextEmbedder(model="model") + embedder.embedding_backend = MagicMock() + + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="SentenceTransformersSparseTextEmbedder expects a string as input"): + embedder.run(text=list_integers_input) + + @patch( + "haystack.components.embedders.sentence_transformers_text_embedder._SentenceTransformersEmbeddingBackendFactory" + ) + def test_model_onnx_backend(self, mocked_factory): + onnx_embedder = SentenceTransformersSparseTextEmbedder( + 
model="naver/splade-cocondenser-ensembledistil", + token=None, + device=ComponentDevice.from_str("cpu"), + model_kwargs={ + "file_name": "onnx/model.onnx" + }, # setting the path isn't necessary if the repo contains a "onnx/model.onnx" file but this is to prevent a HF warning + backend="onnx", + ) + onnx_embedder.warm_up() + + mocked_factory.get_embedding_backend.assert_called_once_with( + model="naver/splade-cocondenser-ensembledistil", + device="cpu", + auth_token=None, + trust_remote_code=False, + local_files_only=False, + model_kwargs={"file_name": "onnx/model.onnx"}, + tokenizer_kwargs=None, + config_kwargs=None, + backend="onnx", + ) + + @patch( + "haystack.components.embedders.sentence_transformers_text_embedder._SentenceTransformersEmbeddingBackendFactory" + ) + def test_model_openvino_backend(self, mocked_factory): + openvino_embedder = SentenceTransformersSparseTextEmbedder( + model="naver/splade-cocondenser-ensembledistil", + token=None, + device=ComponentDevice.from_str("cpu"), + model_kwargs={ + "file_name": "openvino/openvino_model.xml" + }, # setting the path isn't necessary if the repo contains a "openvino/openvino_model.xml" file but this is to prevent a HF warning + backend="openvino", + ) + openvino_embedder.warm_up() + + mocked_factory.get_embedding_backend.assert_called_once_with( + model="naver/splade-cocondenser-ensembledistil", + device="cpu", + auth_token=None, + trust_remote_code=False, + local_files_only=False, + model_kwargs={"file_name": "openvino/openvino_model.xml"}, + tokenizer_kwargs=None, + config_kwargs=None, + backend="openvino", + ) + + @patch( + "haystack.components.embedders.sentence_transformers_text_embedder._SentenceTransformersEmbeddingBackendFactory" + ) + @pytest.mark.parametrize("model_kwargs", [{"torch_dtype": "bfloat16"}, {"torch_dtype": "float16"}]) + def test_dtype_on_gpu(self, mocked_factory, model_kwargs): + torch_dtype_embedder = SentenceTransformersSparseTextEmbedder( + model="naver/splade-cocondenser-ensembledistil", + token=None, + device=ComponentDevice.from_str("cuda:0"), + model_kwargs=model_kwargs, + ) + torch_dtype_embedder.warm_up() + + mocked_factory.get_embedding_backend.assert_called_once_with( + model="naver/splade-cocondenser-ensembledistil", + device="cuda:0", + auth_token=None, + trust_remote_code=False, + local_files_only=False, + model_kwargs=model_kwargs, + tokenizer_kwargs=None, + config_kwargs=None, + backend="torch", + ) From abd7ea5ebd9baf352c59046932754e46a5d980a3 Mon Sep 17 00:00:00 2001 From: ryzhtus Date: Thu, 3 Jul 2025 18:33:50 +0200 Subject: [PATCH 02/20] Added SentenceTransformersSparseDocumentEmbedder --- ...e_transformers_sparse_document_embedder.py | 231 ++++++++++ ...e_transformers_sparse_document_embedder.py | 401 ++++++++++++++++++ 2 files changed, 632 insertions(+) create mode 100644 haystack/components/embedders/sentence_transformers_sparse_document_embedder.py create mode 100644 test/components/embedders/test_sentence_transformers_sparse_document_embedder.py diff --git a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py new file mode 100644 index 0000000000..1215c09748 --- /dev/null +++ b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py @@ -0,0 +1,231 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Dict, List, Literal, Optional + +from haystack import Document, component, 
default_from_dict, default_to_dict +from haystack.components.embedders.backends.sentence_transformers_backend import ( + _SentenceTransformersEmbeddingBackendFactory, + _SentenceTransformersSparseEncoderEmbeddingBackend, +) +from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace +from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs + + +@component +class SentenceTransformersSparseDocumentEmbedder: + """ + Calculates document embeddings using sprase embedding models from Sentence Transformers. + + It stores the embeddings in the `embedding` metadata field of each document. + You can also embed documents' metadata. + Use this component in indexing pipelines to embed input documents + and send them to DocumentWriter to write a into a Document Store. + + ### Usage example: + + ```python + from haystack import Document + from haystack.components.embedders import SentenceTransformersSparseDocumentEmbedder + doc = Document(content="I love pizza!") + doc_embedder = SentenceTransformersSparseDocumentEmbedder() + doc_embedder.warm_up() + + result = doc_embedder.run([doc]) + print(result['documents'][0].embedding) + + # [-0.07804739475250244, 0.1498992145061493, ...] + ``` + """ + + def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments + self, + model: str = "naver/splade-cocondenser-ensembledistil", + device: Optional[ComponentDevice] = None, + token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False), + prefix: str = "", + suffix: str = "", + batch_size: int = 32, + progress_bar: bool = True, + meta_fields_to_embed: Optional[List[str]] = None, + embedding_separator: str = "\n", + trust_remote_code: bool = False, + local_files_only: bool = False, + model_kwargs: Optional[Dict[str, Any]] = None, + tokenizer_kwargs: Optional[Dict[str, Any]] = None, + config_kwargs: Optional[Dict[str, Any]] = None, + backend: Literal["torch", "onnx", "openvino"] = "torch", + ): + """ + Creates a SentenceTransformersDocumentEmbedder component. + + :param model: + The model to use for calculating embeddings. + Pass a local path or ID of the model on Hugging Face. + :param device: + The device to use for loading the model. + Overrides the default device. + :param token: + The API token to download private models from Hugging Face. + :param prefix: + A string to add at the beginning of each document text. + Can be used to prepend the text with an instruction, as required by some embedding models, + such as E5 and bge. + :param suffix: + A string to add at the end of each document text. + :param batch_size: + Number of documents to embed at once. + :param progress_bar: + If `True`, shows a progress bar when embedding documents. + :param meta_fields_to_embed: + List of metadata fields to embed along with the document text. + :param embedding_separator: + Separator used to concatenate the metadata fields to the document text. + :param trust_remote_code: + If `False`, allows only Hugging Face verified model architectures. + If `True`, allows custom models and scripts. + :param local_files_only: + If `True`, does not attempt to download the model from Hugging Face Hub and only looks at local files. + :param model_kwargs: + Additional keyword arguments for `AutoModelForSequenceClassification.from_pretrained` + when loading the model. Refer to specific model documentation for available kwargs. + :param tokenizer_kwargs: + Additional keyword arguments for `AutoTokenizer.from_pretrained` when loading the tokenizer. 
+ Refer to specific model documentation for available kwargs. + :param config_kwargs: + Additional keyword arguments for `AutoConfig.from_pretrained` when loading the model configuration. + :param backend: + The backend to use for the Sentence Transformers model. Choose from "torch", "onnx", or "openvino". + Refer to the [Sentence Transformers documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html) + for more information on acceleration and quantization options. + """ + + self.model = model + self.device = ComponentDevice.resolve_device(device) + self.token = token + self.prefix = prefix + self.suffix = suffix + self.batch_size = batch_size + self.progress_bar = progress_bar + self.meta_fields_to_embed = meta_fields_to_embed or [] + self.embedding_separator = embedding_separator + self.trust_remote_code = trust_remote_code + self.local_files_only = local_files_only + self.model_kwargs = model_kwargs + self.tokenizer_kwargs = tokenizer_kwargs + self.config_kwargs = config_kwargs + self.embedding_backend: Optional[_SentenceTransformersSparseEncoderEmbeddingBackend] = None + self.backend = backend + + def _get_telemetry_data(self) -> Dict[str, Any]: + """ + Data that is sent to Posthog for usage analytics. + """ + return {"model": self.model} + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + serialization_dict = default_to_dict( + self, + model=self.model, + device=self.device.to_dict(), + token=self.token.to_dict() if self.token else None, + prefix=self.prefix, + suffix=self.suffix, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + meta_fields_to_embed=self.meta_fields_to_embed, + embedding_separator=self.embedding_separator, + trust_remote_code=self.trust_remote_code, + local_files_only=self.local_files_only, + model_kwargs=self.model_kwargs, + tokenizer_kwargs=self.tokenizer_kwargs, + config_kwargs=self.config_kwargs, + backend=self.backend, + ) + if serialization_dict["init_parameters"].get("model_kwargs") is not None: + serialize_hf_model_kwargs(serialization_dict["init_parameters"]["model_kwargs"]) + return serialization_dict + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "SentenceTransformersSparseDocumentEmbedder": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + init_params = data["init_parameters"] + if init_params.get("device") is not None: + init_params["device"] = ComponentDevice.from_dict(init_params["device"]) + deserialize_secrets_inplace(init_params, keys=["token"]) + if init_params.get("model_kwargs") is not None: + deserialize_hf_model_kwargs(init_params["model_kwargs"]) + return default_from_dict(cls, data) + + def warm_up(self): + """ + Initializes the component. 
+ """ + if self.embedding_backend is None: + self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend( + model=self.model, + device=self.device.to_torch_str(), + auth_token=self.token, + trust_remote_code=self.trust_remote_code, + local_files_only=self.local_files_only, + model_kwargs=self.model_kwargs, + tokenizer_kwargs=self.tokenizer_kwargs, + config_kwargs=self.config_kwargs, + backend=self.backend, + sparse=True, + ) + if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"): + self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"] + + @component.output_types(documents=List[Document]) + def run(self, documents: List[Document]): + """ + Embed a list of documents. + + :param documents: + Documents to embed. + + :returns: + A dictionary with the following keys: + - `documents`: Documents with embeddings. + """ + if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): + raise TypeError( + "SentenceTransformersSparseDocumentEmbedder expects a list of Documents as input." + "In case you want to embed a list of strings, please use the SentenceTransformersSparseTextEmbedder." + ) + if self.embedding_backend is None: + raise RuntimeError("The embedding model has not been loaded. Please call warm_up() before running.") + + texts_to_embed = [] + for doc in documents: + meta_values_to_embed = [ + str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] + ] + text_to_embed = ( + self.prefix + self.embedding_separator.join(meta_values_to_embed + [doc.content or ""]) + self.suffix + ) + texts_to_embed.append(text_to_embed) + + embeddings = self.embedding_backend.embed( + texts_to_embed, batch_size=self.batch_size, show_progress_bar=self.progress_bar + ) + + for doc, emb in zip(documents, embeddings): + doc.embedding = emb + + return {"documents": documents} diff --git a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py new file mode 100644 index 0000000000..497e301474 --- /dev/null +++ b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py @@ -0,0 +1,401 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import random +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from haystack import Document +from haystack.components.embedders.sentence_transformers_sparse_document_embedder import ( + SentenceTransformersSparseDocumentEmbedder, +) +from haystack.utils import ComponentDevice, Secret + + +class TestSentenceTransformersDocumentEmbedder: + def test_init_default(self): + embedder = SentenceTransformersSparseDocumentEmbedder(model="model") + assert embedder.model == "model" + assert embedder.device == ComponentDevice.resolve_device(None) + assert embedder.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False) + assert embedder.prefix == "" + assert embedder.suffix == "" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.meta_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + assert embedder.trust_remote_code is False + assert embedder.local_files_only is False + + def test_init_with_parameters(self): + embedder = SentenceTransformersSparseDocumentEmbedder( + model="model", + device=ComponentDevice.from_str("cuda:0"), + 
token=Secret.from_token("fake-api-token"), + prefix="prefix", + suffix="suffix", + batch_size=64, + progress_bar=False, + meta_fields_to_embed=["test_field"], + embedding_separator=" | ", + trust_remote_code=True, + local_files_only=True, + ) + assert embedder.model == "model" + assert embedder.device == ComponentDevice.from_str("cuda:0") + assert embedder.token == Secret.from_token("fake-api-token") + assert embedder.prefix == "prefix" + assert embedder.suffix == "suffix" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.meta_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + assert embedder.trust_remote_code + assert embedder.local_files_only + + def test_to_dict(self): + component = SentenceTransformersSparseDocumentEmbedder(model="model", device=ComponentDevice.from_str("cpu")) + data = component.to_dict() + assert data == { + "type": "haystack.components.embedders.sentence_transformers_sparse_document_embedder.SentenceTransformersSparseDocumentEmbedder", + "init_parameters": { + "model": "model", + "device": ComponentDevice.from_str("cpu").to_dict(), + "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"}, + "prefix": "", + "suffix": "", + "batch_size": 32, + "progress_bar": True, + "embedding_separator": "\n", + "meta_fields_to_embed": [], + "trust_remote_code": False, + "local_files_only": False, + "model_kwargs": None, + "tokenizer_kwargs": None, + "config_kwargs": None, + "backend": "torch", + }, + } + + def test_to_dict_with_custom_init_parameters(self): + component = SentenceTransformersSparseDocumentEmbedder( + model="model", + device=ComponentDevice.from_str("cuda:0"), + token=Secret.from_env_var("ENV_VAR", strict=False), + prefix="prefix", + suffix="suffix", + batch_size=64, + progress_bar=False, + meta_fields_to_embed=["meta_field"], + embedding_separator=" - ", + trust_remote_code=True, + local_files_only=True, + model_kwargs={"torch_dtype": torch.float32}, + tokenizer_kwargs={"model_max_length": 512}, + config_kwargs={"use_memory_efficient_attention": True}, + ) + data = component.to_dict() + + assert data == { + "type": "haystack.components.embedders.sentence_transformers_sparse_document_embedder.SentenceTransformersSparseDocumentEmbedder", + "init_parameters": { + "model": "model", + "device": ComponentDevice.from_str("cuda:0").to_dict(), + "token": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"}, + "prefix": "prefix", + "suffix": "suffix", + "batch_size": 64, + "progress_bar": False, + "embedding_separator": " - ", + "trust_remote_code": True, + "local_files_only": True, + "meta_fields_to_embed": ["meta_field"], + "model_kwargs": {"torch_dtype": "torch.float32"}, + "tokenizer_kwargs": {"model_max_length": 512}, + "config_kwargs": {"use_memory_efficient_attention": True}, + "backend": "torch", + }, + } + + def test_from_dict(self): + init_parameters = { + "model": "model", + "device": ComponentDevice.from_str("cuda:0").to_dict(), + "token": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"}, + "prefix": "prefix", + "suffix": "suffix", + "batch_size": 64, + "progress_bar": False, + "embedding_separator": " - ", + "meta_fields_to_embed": ["meta_field"], + "trust_remote_code": True, + "local_files_only": True, + "model_kwargs": {"torch_dtype": "torch.float32"}, + "tokenizer_kwargs": {"model_max_length": 512}, + "config_kwargs": {"use_memory_efficient_attention": True}, + } + component = SentenceTransformersSparseDocumentEmbedder.from_dict( + 
{ + "type": "haystack.components.embedders.sentence_transformers_sparse_document_embedder.SentenceTransformersSparseDocumentEmbedder", + "init_parameters": init_parameters, + } + ) + assert component.model == "model" + assert component.device == ComponentDevice.from_str("cuda:0") + assert component.token == Secret.from_env_var("ENV_VAR", strict=False) + assert component.prefix == "prefix" + assert component.suffix == "suffix" + assert component.batch_size == 64 + assert component.progress_bar is False + assert component.embedding_separator == " - " + assert component.trust_remote_code + assert component.local_files_only + assert component.meta_fields_to_embed == ["meta_field"] + assert component.model_kwargs == {"torch_dtype": torch.float32} + assert component.tokenizer_kwargs == {"model_max_length": 512} + assert component.config_kwargs == {"use_memory_efficient_attention": True} + + def test_from_dict_no_default_parameters(self): + component = SentenceTransformersSparseDocumentEmbedder.from_dict( + { + "type": "haystack.components.embedders.sentence_transformers_sparse_document_embedder.SentenceTransformersSparseDocumentEmbedder", + "init_parameters": {}, + } + ) + assert component.model == "naver/splade-cocondenser-ensembledistil" + assert component.device == ComponentDevice.resolve_device(None) + assert component.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False) + assert component.prefix == "" + assert component.suffix == "" + assert component.batch_size == 32 + assert component.progress_bar is True + assert component.embedding_separator == "\n" + assert component.trust_remote_code is False + assert component.local_files_only is False + assert component.meta_fields_to_embed == [] + + def test_from_dict_none_device(self): + init_parameters = { + "model": "model", + "device": None, + "token": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"}, + "prefix": "prefix", + "suffix": "suffix", + "batch_size": 64, + "progress_bar": False, + "embedding_separator": " - ", + "meta_fields_to_embed": ["meta_field"], + "trust_remote_code": True, + "local_files_only": False, + } + component = SentenceTransformersSparseDocumentEmbedder.from_dict( + { + "type": "haystack.components.embedders.sentence_transformers_sparse_document_embedder.SentenceTransformersSparseDocumentEmbedder", + "init_parameters": init_parameters, + } + ) + assert component.model == "model" + assert component.device == ComponentDevice.resolve_device(None) + assert component.token == Secret.from_env_var("ENV_VAR", strict=False) + assert component.prefix == "prefix" + assert component.suffix == "suffix" + assert component.batch_size == 64 + assert component.progress_bar is False + assert component.embedding_separator == " - " + assert component.trust_remote_code + assert component.local_files_only is False + assert component.meta_fields_to_embed == ["meta_field"] + + @patch( + "haystack.components.embedders.sentence_transformers_document_embedder._SentenceTransformersEmbeddingBackendFactory" + ) + def test_warmup(self, mocked_factory): + embedder = SentenceTransformersSparseDocumentEmbedder( + model="model", + token=None, + device=ComponentDevice.from_str("cpu"), + tokenizer_kwargs={"model_max_length": 512}, + config_kwargs={"use_memory_efficient_attention": True}, + ) + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + embedder.embedding_backend.model.max_seq_length = 512 + mocked_factory.get_embedding_backend.assert_called_once_with( + model="model", + device="cpu", + 
auth_token=None, + trust_remote_code=False, + local_files_only=False, + model_kwargs=None, + tokenizer_kwargs={"model_max_length": 512}, + config_kwargs={"use_memory_efficient_attention": True}, + backend="torch", + ) + + @patch( + "haystack.components.embedders.sentence_transformers_document_embedder._SentenceTransformersEmbeddingBackendFactory" + ) + def test_warmup_doesnt_reload(self, mocked_factory): + embedder = SentenceTransformersSparseDocumentEmbedder(model="model") + mocked_factory.get_embedding_backend.assert_not_called() + embedder.warm_up() + embedder.warm_up() + mocked_factory.get_embedding_backend.assert_called_once() + + def test_run(self): + embedder = SentenceTransformersSparseDocumentEmbedder(model="model") + embedder.embedding_backend = MagicMock() + embedder.embedding_backend.embed = lambda x, **kwargs: [ + [random.random() for _ in range(16)] for _ in range(len(x)) + ] + + documents = [Document(content=f"document number {i}") for i in range(5)] + + result = embedder.run(documents=documents) + + assert isinstance(result["documents"], list) + assert len(result["documents"]) == len(documents) + for doc in result["documents"]: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert isinstance(doc.embedding[0], float) + + def test_run_wrong_input_format(self): + embedder = SentenceTransformersSparseDocumentEmbedder(model="model") + + string_input = "text" + list_integers_input = [1, 2, 3] + + with pytest.raises( + TypeError, match="SentenceTransformersSparseDocumentEmbedder expects a list of Documents as input" + ): + embedder.run(documents=string_input) + + with pytest.raises( + TypeError, match="SentenceTransformersSparseDocumentEmbedder expects a list of Documents as input" + ): + embedder.run(documents=list_integers_input) + + def test_embed_metadata(self): + embedder = SentenceTransformersSparseDocumentEmbedder( + model="model", meta_fields_to_embed=["meta_field"], embedding_separator="\n" + ) + embedder.embedding_backend = MagicMock() + documents = [Document(content=f"document number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)] + embedder.run(documents=documents) + embedder.embedding_backend.embed.assert_called_once_with( + [ + "meta_value 0\ndocument number 0", + "meta_value 1\ndocument number 1", + "meta_value 2\ndocument number 2", + "meta_value 3\ndocument number 3", + "meta_value 4\ndocument number 4", + ], + batch_size=32, + show_progress_bar=True, + ) + + def test_prefix_suffix(self): + embedder = SentenceTransformersSparseDocumentEmbedder( + model="model", + prefix="my_prefix ", + suffix=" my_suffix", + meta_fields_to_embed=["meta_field"], + embedding_separator="\n", + ) + embedder.embedding_backend = MagicMock() + documents = [Document(content=f"document number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)] + embedder.run(documents=documents) + embedder.embedding_backend.embed.assert_called_once_with( + [ + "my_prefix meta_value 0\ndocument number 0 my_suffix", + "my_prefix meta_value 1\ndocument number 1 my_suffix", + "my_prefix meta_value 2\ndocument number 2 my_suffix", + "my_prefix meta_value 3\ndocument number 3 my_suffix", + "my_prefix meta_value 4\ndocument number 4 my_suffix", + ], + batch_size=32, + show_progress_bar=True, + ) + + @patch( + "haystack.components.embedders.sentence_transformers_document_embedder._SentenceTransformersEmbeddingBackendFactory" + ) + def test_model_onnx_backend(self, mocked_factory): + onnx_embedder = SentenceTransformersSparseDocumentEmbedder( + 
model="naver/splade-cocondenser-ensembledistil", + token=None, + device=ComponentDevice.from_str("cpu"), + model_kwargs={ + "file_name": "onnx/model.onnx" + }, # setting the path isn't necessary if the repo contains a "onnx/model.onnx" file but this is to prevent a HF warning + backend="onnx", + ) + onnx_embedder.warm_up() + + mocked_factory.get_embedding_backend.assert_called_once_with( + model="naver/splade-cocondenser-ensembledistil", + device="cpu", + auth_token=None, + trust_remote_code=False, + local_files_only=False, + model_kwargs={"file_name": "onnx/model.onnx"}, + tokenizer_kwargs=None, + config_kwargs=None, + backend="onnx", + ) + + @patch( + "haystack.components.embedders.sentence_transformers_document_embedder._SentenceTransformersEmbeddingBackendFactory" + ) + def test_model_openvino_backend(self, mocked_factory): + openvino_embedder = SentenceTransformersSparseDocumentEmbedder( + model="naver/splade-cocondenser-ensembledistil", + token=None, + device=ComponentDevice.from_str("cpu"), + model_kwargs={ + "file_name": "openvino/openvino_model.xml" + }, # setting the path isn't necessary if the repo contains a "openvino/openvino_model.xml" file but this is to prevent a HF warning + backend="openvino", + ) + openvino_embedder.warm_up() + + mocked_factory.get_embedding_backend.assert_called_once_with( + model="naver/splade-cocondenser-ensembledistil", + device="cpu", + auth_token=None, + trust_remote_code=False, + local_files_only=False, + model_kwargs={"file_name": "openvino/openvino_model.xml"}, + tokenizer_kwargs=None, + config_kwargs=None, + backend="openvino", + ) + + @patch( + "haystack.components.embedders.sentence_transformers_document_embedder._SentenceTransformersEmbeddingBackendFactory" + ) + @pytest.mark.parametrize("model_kwargs", [{"torch_dtype": "bfloat16"}, {"torch_dtype": "float16"}]) + def test_dtype_on_gpu(self, mocked_factory, model_kwargs): + torch_dtype_embedder = SentenceTransformersSparseDocumentEmbedder( + model="naver/splade-cocondenser-ensembledistil", + token=None, + device=ComponentDevice.from_str("cuda:0"), + model_kwargs=model_kwargs, + ) + torch_dtype_embedder.warm_up() + + mocked_factory.get_embedding_backend.assert_called_once_with( + model="naver/splade-cocondenser-ensembledistil", + device="cuda:0", + auth_token=None, + trust_remote_code=False, + local_files_only=False, + model_kwargs=model_kwargs, + tokenizer_kwargs=None, + config_kwargs=None, + backend="torch", + ) From 82b87c2cfad281e2af3427c6ed379b275e5fe941 Mon Sep 17 00:00:00 2001 From: ryzhtus Date: Sun, 3 Aug 2025 17:18:27 +0200 Subject: [PATCH 03/20] Created a separate _SentenceTransformersSparseEmbeddingBackendFactory and added tests --- .../backends/sentence_transformers_backend.py | 77 +++++++++++++------ ...e_transformers_sparse_document_embedder.py | 4 +- ...tence_transformers_sparse_text_embedder.py | 4 +- ...sentence_transformers_embedding_backend.py | 48 ++++++++++++ 4 files changed, 104 insertions(+), 29 deletions(-) diff --git a/haystack/components/embedders/backends/sentence_transformers_backend.py b/haystack/components/embedders/backends/sentence_transformers_backend.py index e337bc00e1..835dfe8e3a 100644 --- a/haystack/components/embedders/backends/sentence_transformers_backend.py +++ b/haystack/components/embedders/backends/sentence_transformers_backend.py @@ -37,36 +37,63 @@ def get_embedding_backend( # pylint: disable=too-many-positional-arguments if embedding_backend_id in _SentenceTransformersEmbeddingBackendFactory._instances: return 
_SentenceTransformersEmbeddingBackendFactory._instances[embedding_backend_id] - if sparse: - embedding_backend = _SentenceTransformersSparseEncoderEmbeddingBackend( - model=model, - device=device, - auth_token=auth_token, - trust_remote_code=trust_remote_code, - local_files_only=local_files_only, - model_kwargs=model_kwargs, - tokenizer_kwargs=tokenizer_kwargs, - config_kwargs=config_kwargs, - backend=backend, - ) - else: - embedding_backend = _SentenceTransformersEmbeddingBackend( - model=model, - device=device, - auth_token=auth_token, - trust_remote_code=trust_remote_code, - local_files_only=local_files_only, - truncate_dim=truncate_dim, - model_kwargs=model_kwargs, - tokenizer_kwargs=tokenizer_kwargs, - config_kwargs=config_kwargs, - backend=backend, - ) + embedding_backend = _SentenceTransformersEmbeddingBackend( + model=model, + device=device, + auth_token=auth_token, + trust_remote_code=trust_remote_code, + local_files_only=local_files_only, + truncate_dim=truncate_dim, + model_kwargs=model_kwargs, + tokenizer_kwargs=tokenizer_kwargs, + config_kwargs=config_kwargs, + backend=backend, + ) _SentenceTransformersEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend return embedding_backend +class _SentenceTransformersSparseEmbeddingBackendFactory: + """ + Factory class to create instances of Sentence Transformers embedding backends. + """ + + _instances: Dict[str, "_SentenceTransformersSparseEncoderEmbeddingBackend"] = {} + + @staticmethod + def get_embedding_backend( # pylint: disable=too-many-positional-arguments + model: str, + device: Optional[str] = None, + auth_token: Optional[Secret] = None, + trust_remote_code: bool = False, + local_files_only: bool = False, + model_kwargs: Optional[Dict[str, Any]] = None, + tokenizer_kwargs: Optional[Dict[str, Any]] = None, + config_kwargs: Optional[Dict[str, Any]] = None, + backend: Literal["torch", "onnx", "openvino"] = "torch", + ): + embedding_backend_id = f"{model}{device}{auth_token}{backend}" + + if embedding_backend_id in _SentenceTransformersSparseEmbeddingBackendFactory._instances: + return _SentenceTransformersSparseEmbeddingBackendFactory._instances[embedding_backend_id] + + embedding_backend = _SentenceTransformersSparseEncoderEmbeddingBackend( + model=model, + device=device, + auth_token=auth_token, + trust_remote_code=trust_remote_code, + local_files_only=local_files_only, + model_kwargs=model_kwargs, + tokenizer_kwargs=tokenizer_kwargs, + config_kwargs=config_kwargs, + backend=backend, + ) + + _SentenceTransformersSparseEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend + return embedding_backend + + class _SentenceTransformersEmbeddingBackend: """ Class to manage Sentence Transformers embeddings. 
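Both factories follow the same memoization contract: a backend is cached under a composite id string and loaded at most once, so components that share a model, device, auth token, and backend reuse a single instance across `warm_up()` calls. A minimal sketch of that pattern, with illustrative names rather than the actual Haystack classes (the real cache keys are the f-strings shown above):

```python
from typing import Dict, Optional


class _Backend:
    """Stand-in for a loaded SentenceTransformer or SparseEncoder model."""

    def __init__(self, model: str, device: Optional[str]) -> None:
        self.model = model
        self.device = device


class _Factory:
    _instances: Dict[str, "_Backend"] = {}

    @staticmethod
    def get_embedding_backend(model: str, device: Optional[str] = None, backend: str = "torch") -> "_Backend":
        key = f"{model}{device}{backend}"  # composite cache key, as in the ids above
        if key not in _Factory._instances:
            _Factory._instances[key] = _Backend(model, device)  # load only on first request
        return _Factory._instances[key]


# Same arguments return the same cached instance; different arguments create a new one,
# mirroring test_factory_behavior and test_sparse_factory_behavior below.
assert _Factory.get_embedding_backend("my_model", "cpu") is _Factory.get_embedding_backend("my_model", "cpu")
assert _Factory.get_embedding_backend("another_model", "cpu") is not _Factory.get_embedding_backend("my_model", "cpu")
```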
diff --git a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py index 1215c09748..c4f80a068f 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py @@ -6,7 +6,7 @@ from haystack import Document, component, default_from_dict, default_to_dict from haystack.components.embedders.backends.sentence_transformers_backend import ( - _SentenceTransformersEmbeddingBackendFactory, + _SentenceTransformersSparseEmbeddingBackendFactory, _SentenceTransformersSparseEncoderEmbeddingBackend, ) from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace @@ -176,7 +176,7 @@ def warm_up(self): Initializes the component. """ if self.embedding_backend is None: - self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend( + self.embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( model=self.model, device=self.device.to_torch_str(), auth_token=self.token, diff --git a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py index 9247d75e8c..388bbcf644 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py @@ -6,7 +6,7 @@ from haystack import component, default_from_dict, default_to_dict from haystack.components.embedders.backends.sentence_transformers_backend import ( - _SentenceTransformersEmbeddingBackendFactory, + _SentenceTransformersSparseEmbeddingBackendFactory, _SentenceTransformersSparseEncoderEmbeddingBackend, ) from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace @@ -169,7 +169,7 @@ def warm_up(self): Initializes the component. 
""" if self.embedding_backend is None: - self.embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend( + self.embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( model=self.model, device=self.device.to_torch_str(), auth_token=self.token, diff --git a/test/components/embedders/test_sentence_transformers_embedding_backend.py b/test/components/embedders/test_sentence_transformers_embedding_backend.py index 3de0f0d9cc..ce2e5c489c 100644 --- a/test/components/embedders/test_sentence_transformers_embedding_backend.py +++ b/test/components/embedders/test_sentence_transformers_embedding_backend.py @@ -8,6 +8,7 @@ from haystack.components.embedders.backends.sentence_transformers_backend import ( _SentenceTransformersEmbeddingBackendFactory, + _SentenceTransformersSparseEmbeddingBackendFactory, ) from haystack.utils.auth import Secret @@ -26,6 +27,20 @@ def test_factory_behavior(mock_sentence_transformer): assert another_embedding_backend is not embedding_backend +@patch("haystack.components.embedders.backends.sentence_transformers_backend.SparseEncoder") +def test_sparse_factory_behavior(mock_sparse_encoder): + embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( + model="my_model", device="cpu" + ) + same_embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend("my_model", "cpu") + another_embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( + model="another_model", device="cpu" + ) + + assert same_embedding_backend is embedding_backend + assert another_embedding_backend is not embedding_backend + + @patch("haystack.components.embedders.backends.sentence_transformers_backend.SentenceTransformer") def test_model_initialization(mock_sentence_transformer): _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend( @@ -51,6 +66,29 @@ def test_model_initialization(mock_sentence_transformer): ) +@patch("haystack.components.embedders.backends.sentence_transformers_backend.SparseEncoder") +def test_sparse_model_initialization(mock_sparse_encoder): + _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( + model="model", + device="cpu", + auth_token=Secret.from_token("fake-api-token"), + trust_remote_code=True, + local_files_only=True, + backend="torch", + ) + mock_sparse_encoder.assert_called_once_with( + model_name_or_path="model", + device="cpu", + token="fake-api-token", + trust_remote_code=True, + local_files_only=True, + model_kwargs=None, + tokenizer_kwargs=None, + config_kwargs=None, + backend="torch", + ) + + @patch("haystack.components.embedders.backends.sentence_transformers_backend.SentenceTransformer") def test_embedding_function_with_kwargs(mock_sentence_transformer): embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(model="model") @@ -59,3 +97,13 @@ def test_embedding_function_with_kwargs(mock_sentence_transformer): embedding_backend.embed(data=data, normalize_embeddings=True) embedding_backend.model.encode.assert_called_once_with(data, normalize_embeddings=True) + + +@patch("haystack.components.embedders.backends.sentence_transformers_backend.SparseEncoder") +def test_sparse_embedding_function_with_kwargs(mock_sparse_encoder): + embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend(model="model") + + data = ["sentence1", "sentence2"] + embedding_backend.embed(data=data, attn_implementation="sdpa") + + 
embedding_backend.model.encode.assert_called_once_with(data, attn_implementation="sdpa") From 73eaa97ad4b18927e65b823019e628bb30dff2f6 Mon Sep 17 00:00:00 2001 From: ryzhtus Date: Sun, 3 Aug 2025 17:46:27 +0200 Subject: [PATCH 04/20] Remove unused parameter --- .../embedders/backends/sentence_transformers_backend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/haystack/components/embedders/backends/sentence_transformers_backend.py b/haystack/components/embedders/backends/sentence_transformers_backend.py index 835dfe8e3a..78a4278158 100644 --- a/haystack/components/embedders/backends/sentence_transformers_backend.py +++ b/haystack/components/embedders/backends/sentence_transformers_backend.py @@ -30,7 +30,6 @@ def get_embedding_backend( # pylint: disable=too-many-positional-arguments tokenizer_kwargs: Optional[Dict[str, Any]] = None, config_kwargs: Optional[Dict[str, Any]] = None, backend: Literal["torch", "onnx", "openvino"] = "torch", - sparse: bool = False, ): embedding_backend_id = f"{model}{device}{auth_token}{truncate_dim}{backend}" From 74c222e8e55efe57cba568545a3031d1fdddd189 Mon Sep 17 00:00:00 2001 From: ryzhtus Date: Sun, 3 Aug 2025 19:15:13 +0200 Subject: [PATCH 05/20] Wrapped output into SparseEmbedding dataclass + fix tests --- .../backends/sentence_transformers_backend.py | 20 ++++++++++++++++--- ...tence_transformers_sparse_text_embedder.py | 1 - ...tence_transformers_sparse_text_embedder.py | 10 +++++----- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/haystack/components/embedders/backends/sentence_transformers_backend.py b/haystack/components/embedders/backends/sentence_transformers_backend.py index 78a4278158..f474d38a1c 100644 --- a/haystack/components/embedders/backends/sentence_transformers_backend.py +++ b/haystack/components/embedders/backends/sentence_transformers_backend.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Literal, Optional +from haystack.dataclasses.sparse_embedding import SparseEmbedding from haystack.lazy_imports import LazyImport from haystack.utils.auth import Secret @@ -162,6 +163,19 @@ def __init__( # pylint: disable=too-many-positional-arguments backend=backend, ) - def embed(self, data: List[str], **kwargs) -> List[List[float]]: - embeddings = self.model.encode(data, **kwargs).tolist() - return embeddings + def embed(self, data: List[str], **kwargs) -> List[SparseEmbedding]: + embeddings = self.model.encode(data, **kwargs) + + sparse_embeddings = [] + + if isinstance(embeddings, list): + for embedding in embeddings: + sparse_embeddings.append( + SparseEmbedding(indices=embedding.indices.tolist(), values=embedding.values.tolist()) + ) + else: + sparse_embeddings.append( + SparseEmbedding(indices=embeddings.indices.tolist(), values=embeddings.values.tolist()) + ) + + return sparse_embeddings diff --git a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py index 388bbcf644..04a7948de7 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py @@ -179,7 +179,6 @@ def warm_up(self): tokenizer_kwargs=self.tokenizer_kwargs, config_kwargs=self.config_kwargs, backend=self.backend, - sparse=True, ) if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"): self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"] diff --git 
a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py index dc5ca4c770..426b809eed 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py +++ b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py @@ -189,7 +189,7 @@ def test_from_dict_none_device(self): assert component.local_files_only is False @patch( - "haystack.components.embedders.sentence_transformers_text_embedder._SentenceTransformersEmbeddingBackendFactory" + "haystack.components.embedders.sentence_transformers_sparse_text_embedder._SentenceTransformersSparseEmbeddingBackendFactory" ) def test_warmup(self, mocked_factory): embedder = SentenceTransformersSparseTextEmbedder( @@ -214,7 +214,7 @@ def test_warmup(self, mocked_factory): ) @patch( - "haystack.components.embedders.sentence_transformers_text_embedder._SentenceTransformersEmbeddingBackendFactory" + "haystack.components.embedders.sentence_transformers_sparse_text_embedder._SentenceTransformersSparseEmbeddingBackendFactory" ) def test_warmup_doesnt_reload(self, mocked_factory): embedder = SentenceTransformersSparseTextEmbedder(model="model") @@ -248,7 +248,7 @@ def test_run_wrong_input_format(self): embedder.run(text=list_integers_input) @patch( - "haystack.components.embedders.sentence_transformers_text_embedder._SentenceTransformersEmbeddingBackendFactory" + "haystack.components.embedders.sentence_transformers_sparse_text_embedder._SentenceTransformersSparseEmbeddingBackendFactory" ) def test_model_onnx_backend(self, mocked_factory): onnx_embedder = SentenceTransformersSparseTextEmbedder( @@ -275,7 +275,7 @@ def test_model_onnx_backend(self, mocked_factory): ) @patch( - "haystack.components.embedders.sentence_transformers_text_embedder._SentenceTransformersEmbeddingBackendFactory" + "haystack.components.embedders.sentence_transformers_sparse_text_embedder._SentenceTransformersSparseEmbeddingBackendFactory" ) def test_model_openvino_backend(self, mocked_factory): openvino_embedder = SentenceTransformersSparseTextEmbedder( @@ -302,7 +302,7 @@ def test_model_openvino_backend(self, mocked_factory): ) @patch( - "haystack.components.embedders.sentence_transformers_text_embedder._SentenceTransformersEmbeddingBackendFactory" + "haystack.components.embedders.sentence_transformers_sparse_text_embedder._SentenceTransformersSparseEmbeddingBackendFactory" ) @pytest.mark.parametrize("model_kwargs", [{"torch_dtype": "bfloat16"}, {"torch_dtype": "float16"}]) def test_dtype_on_gpu(self, mocked_factory, model_kwargs): From 4ddde7844e9ed93082d9df92e0101bded6f29262 Mon Sep 17 00:00:00 2001 From: ryzhtus Date: Fri, 22 Aug 2025 18:42:22 +0200 Subject: [PATCH 06/20] Return correct SparseEmbedding, imports and tests --- haystack/components/embedders/__init__.py | 8 ++++++ .../backends/sentence_transformers_backend.py | 25 +++++++++---------- ...e_transformers_sparse_document_embedder.py | 1 - ...sentence_transformers_embedding_backend.py | 5 ++++ ...e_transformers_sparse_document_embedder.py | 15 +++++++---- 5 files changed, 35 insertions(+), 19 deletions(-) diff --git a/haystack/components/embedders/__init__.py b/haystack/components/embedders/__init__.py index bb83fce5b8..a6c92ca3b3 100644 --- a/haystack/components/embedders/__init__.py +++ b/haystack/components/embedders/__init__.py @@ -16,6 +16,8 @@ "openai_text_embedder": ["OpenAITextEmbedder"], "sentence_transformers_document_embedder": ["SentenceTransformersDocumentEmbedder"], 
"sentence_transformers_text_embedder": ["SentenceTransformersTextEmbedder"], + "sentence_transformers_sparse_document_embedder": ["SentenceTransformersSparseDocumentEmbedder"], + "sentence_transformers_sparse_text_embedder": ["SentenceTransformersSparseTextEmbedder"], } if TYPE_CHECKING: @@ -28,6 +30,12 @@ from .sentence_transformers_document_embedder import ( SentenceTransformersDocumentEmbedder as SentenceTransformersDocumentEmbedder, ) + from .sentence_transformers_sparse_document_embedder import ( + SentenceTransformersSparseDocumentEmbedder as SentenceTransformersSparseDocumentEmbedder, + ) + from .sentence_transformers_sparse_text_embedder import ( + SentenceTransformersSparseTextEmbedder as SentenceTransformersSparseTextEmbedder, + ) from .sentence_transformers_text_embedder import ( SentenceTransformersTextEmbedder as SentenceTransformersTextEmbedder, ) diff --git a/haystack/components/embedders/backends/sentence_transformers_backend.py b/haystack/components/embedders/backends/sentence_transformers_backend.py index f474d38a1c..0b15915fa4 100644 --- a/haystack/components/embedders/backends/sentence_transformers_backend.py +++ b/haystack/components/embedders/backends/sentence_transformers_backend.py @@ -164,18 +164,17 @@ def __init__( # pylint: disable=too-many-positional-arguments ) def embed(self, data: List[str], **kwargs) -> List[SparseEmbedding]: - embeddings = self.model.encode(data, **kwargs) - - sparse_embeddings = [] - - if isinstance(embeddings, list): - for embedding in embeddings: - sparse_embeddings.append( - SparseEmbedding(indices=embedding.indices.tolist(), values=embedding.values.tolist()) - ) - else: - sparse_embeddings.append( - SparseEmbedding(indices=embeddings.indices.tolist(), values=embeddings.values.tolist()) - ) + embeddings = self.model.encode(data, **kwargs).coalesce() + + rows, columns = embeddings.indices() + values = embeddings.values() + batch_size = embeddings.size(0) + + sparse_embeddings: List[SparseEmbedding] = [] + for embedding in range(batch_size): + mask = rows == embedding + embedding_columns = columns[mask].tolist() + embedding_values = values[mask].tolist() + sparse_embeddings.append(SparseEmbedding(indices=embedding_columns, values=embedding_values)) return sparse_embeddings diff --git a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py index c4f80a068f..2a8d1d1dd5 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py @@ -186,7 +186,6 @@ def warm_up(self): tokenizer_kwargs=self.tokenizer_kwargs, config_kwargs=self.config_kwargs, backend=self.backend, - sparse=True, ) if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"): self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"] diff --git a/test/components/embedders/test_sentence_transformers_embedding_backend.py b/test/components/embedders/test_sentence_transformers_embedding_backend.py index ce2e5c489c..552cc3df41 100644 --- a/test/components/embedders/test_sentence_transformers_embedding_backend.py +++ b/test/components/embedders/test_sentence_transformers_embedding_backend.py @@ -5,6 +5,7 @@ from unittest.mock import patch import pytest +import torch from haystack.components.embedders.backends.sentence_transformers_backend import ( _SentenceTransformersEmbeddingBackendFactory, @@ -101,6 +102,10 @@ def 
test_embedding_function_with_kwargs(mock_sentence_transformer): @patch("haystack.components.embedders.backends.sentence_transformers_backend.SparseEncoder") def test_sparse_embedding_function_with_kwargs(mock_sparse_encoder): + indices = torch.tensor([[0, 1], [1, 3]]) + values = torch.tensor([0.5, 0.7]) + mock_sparse_encoder.return_value.encode.return_value = torch.sparse_coo_tensor(indices, values, (2, 5)) + embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend(model="model") data = ["sentence1", "sentence2"] diff --git a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py index 497e301474..839521c87c 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py +++ b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py @@ -14,6 +14,11 @@ ) from haystack.utils import ComponentDevice, Secret +from haystack.components.embedders.backends.sentence_transformers_backend import ( + _SentenceTransformersSparseEmbeddingBackendFactory, + _SentenceTransformersSparseEncoderEmbeddingBackend, +) + class TestSentenceTransformersDocumentEmbedder: def test_init_default(self): @@ -210,7 +215,7 @@ def test_from_dict_none_device(self): assert component.meta_fields_to_embed == ["meta_field"] @patch( - "haystack.components.embedders.sentence_transformers_document_embedder._SentenceTransformersEmbeddingBackendFactory" + "haystack.components.embedders.sentence_transformers_sparse_document_embedder._SentenceTransformersSparseEmbeddingBackendFactory" ) def test_warmup(self, mocked_factory): embedder = SentenceTransformersSparseDocumentEmbedder( @@ -236,7 +241,7 @@ def test_warmup(self, mocked_factory): ) @patch( - "haystack.components.embedders.sentence_transformers_document_embedder._SentenceTransformersEmbeddingBackendFactory" + "haystack.components.embedders.sentence_transformers_sparse_document_embedder._SentenceTransformersSparseEmbeddingBackendFactory" ) def test_warmup_doesnt_reload(self, mocked_factory): embedder = SentenceTransformersSparseDocumentEmbedder(model="model") @@ -322,7 +327,7 @@ def test_prefix_suffix(self): ) @patch( - "haystack.components.embedders.sentence_transformers_document_embedder._SentenceTransformersEmbeddingBackendFactory" + "haystack.components.embedders.sentence_transformers_sparse_document_embedder._SentenceTransformersSparseEmbeddingBackendFactory" ) def test_model_onnx_backend(self, mocked_factory): onnx_embedder = SentenceTransformersSparseDocumentEmbedder( @@ -349,7 +354,7 @@ def test_model_onnx_backend(self, mocked_factory): ) @patch( - "haystack.components.embedders.sentence_transformers_document_embedder._SentenceTransformersEmbeddingBackendFactory" + "haystack.components.embedders.sentence_transformers_sparse_document_embedder._SentenceTransformersSparseEmbeddingBackendFactory" ) def test_model_openvino_backend(self, mocked_factory): openvino_embedder = SentenceTransformersSparseDocumentEmbedder( @@ -376,7 +381,7 @@ def test_model_openvino_backend(self, mocked_factory): ) @patch( - "haystack.components.embedders.sentence_transformers_document_embedder._SentenceTransformersEmbeddingBackendFactory" + "haystack.components.embedders.sentence_transformers_sparse_document_embedder._SentenceTransformersSparseEmbeddingBackendFactory" ) @pytest.mark.parametrize("model_kwargs", [{"torch_dtype": "bfloat16"}, {"torch_dtype": "float16"}]) def test_dtype_on_gpu(self, 
mocked_factory, model_kwargs): From 71950af581774e5f8119cb16c2ffef8fb0d3fc3c Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 28 Aug 2025 10:33:45 +0200 Subject: [PATCH 07/20] fix fmt --- .../backends/sentence_transformers_backend.py | 18 ++++----- ...e_transformers_sparse_document_embedder.py | 18 ++++----- ...tence_transformers_sparse_text_embedder.py | 16 ++++---- ...e_transformers_sparse_document_embedder.py | 37 ++++++++----------- ...tence_transformers_sparse_text_embedder.py | 23 +++++++----- 5 files changed, 55 insertions(+), 57 deletions(-) diff --git a/haystack/components/embedders/backends/sentence_transformers_backend.py b/haystack/components/embedders/backends/sentence_transformers_backend.py index 1e118af009..97be663e27 100644 --- a/haystack/components/embedders/backends/sentence_transformers_backend.py +++ b/haystack/components/embedders/backends/sentence_transformers_backend.py @@ -62,7 +62,7 @@ class _SentenceTransformersSparseEmbeddingBackendFactory: Factory class to create instances of Sentence Transformers embedding backends. """ - _instances: Dict[str, "_SentenceTransformersSparseEncoderEmbeddingBackend"] = {} + _instances: dict[str, "_SentenceTransformersSparseEncoderEmbeddingBackend"] = {} @staticmethod def get_embedding_backend( # pylint: disable=too-many-positional-arguments @@ -71,9 +71,9 @@ def get_embedding_backend( # pylint: disable=too-many-positional-arguments auth_token: Optional[Secret] = None, trust_remote_code: bool = False, local_files_only: bool = False, - model_kwargs: Optional[Dict[str, Any]] = None, - tokenizer_kwargs: Optional[Dict[str, Any]] = None, - config_kwargs: Optional[Dict[str, Any]] = None, + model_kwargs: Optional[dict[str, Any]] = None, + tokenizer_kwargs: Optional[dict[str, Any]] = None, + config_kwargs: Optional[dict[str, Any]] = None, backend: Literal["torch", "onnx", "openvino"] = "torch", ): embedding_backend_id = f"{model}{device}{auth_token}{backend}" @@ -149,9 +149,9 @@ def __init__( # pylint: disable=too-many-positional-arguments auth_token: Optional[Secret] = None, trust_remote_code: bool = False, local_files_only: bool = False, - model_kwargs: Optional[Dict[str, Any]] = None, - tokenizer_kwargs: Optional[Dict[str, Any]] = None, - config_kwargs: Optional[Dict[str, Any]] = None, + model_kwargs: Optional[dict[str, Any]] = None, + tokenizer_kwargs: Optional[dict[str, Any]] = None, + config_kwargs: Optional[dict[str, Any]] = None, backend: Literal["torch", "onnx", "openvino"] = "torch", ): sentence_transformers_import.check() @@ -168,14 +168,14 @@ def __init__( # pylint: disable=too-many-positional-arguments backend=backend, ) - def embed(self, data: List[str], **kwargs) -> List[SparseEmbedding]: + def embed(self, data: list[str], **kwargs) -> list[SparseEmbedding]: embeddings = self.model.encode(data, **kwargs).coalesce() rows, columns = embeddings.indices() values = embeddings.values() batch_size = embeddings.size(0) - sparse_embeddings: List[SparseEmbedding] = [] + sparse_embeddings: list[SparseEmbedding] = [] for embedding in range(batch_size): mask = rows == embedding embedding_columns = columns[mask].tolist() diff --git a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py index 2a8d1d1dd5..87d4159dc4 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py @@ -48,13 +48,13 @@ def __init__( # 
noqa: PLR0913 # pylint: disable=too-many-positional-arguments suffix: str = "", batch_size: int = 32, progress_bar: bool = True, - meta_fields_to_embed: Optional[List[str]] = None, + meta_fields_to_embed: Optional[list[str]] = None, embedding_separator: str = "\n", trust_remote_code: bool = False, local_files_only: bool = False, - model_kwargs: Optional[Dict[str, Any]] = None, - tokenizer_kwargs: Optional[Dict[str, Any]] = None, - config_kwargs: Optional[Dict[str, Any]] = None, + model_kwargs: Optional[dict[str, Any]] = None, + tokenizer_kwargs: Optional[dict[str, Any]] = None, + config_kwargs: Optional[dict[str, Any]] = None, backend: Literal["torch", "onnx", "openvino"] = "torch", ): """ @@ -118,13 +118,13 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments self.embedding_backend: Optional[_SentenceTransformersSparseEncoderEmbeddingBackend] = None self.backend = backend - def _get_telemetry_data(self) -> Dict[str, Any]: + def _get_telemetry_data(self) -> dict[str, Any]: """ Data that is sent to Posthog for usage analytics. """ return {"model": self.model} - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> dict[str, Any]: """ Serializes the component to a dictionary. @@ -154,7 +154,7 @@ def to_dict(self) -> Dict[str, Any]: return serialization_dict @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "SentenceTransformersSparseDocumentEmbedder": + def from_dict(cls, data: dict[str, Any]) -> "SentenceTransformersSparseDocumentEmbedder": """ Deserializes the component from a dictionary. @@ -190,8 +190,8 @@ def warm_up(self): if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"): self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"] - @component.output_types(documents=List[Document]) - def run(self, documents: List[Document]): + @component.output_types(documents=list[Document]) + def run(self, documents: list[Document]): """ Embed a list of documents. diff --git a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py index 04a7948de7..710f78dac9 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py @@ -46,10 +46,10 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments progress_bar: bool = True, trust_remote_code: bool = False, local_files_only: bool = False, - model_kwargs: Optional[Dict[str, Any]] = None, - tokenizer_kwargs: Optional[Dict[str, Any]] = None, - config_kwargs: Optional[Dict[str, Any]] = None, - encode_kwargs: Optional[Dict[str, Any]] = None, + model_kwargs: Optional[dict[str, Any]] = None, + tokenizer_kwargs: Optional[dict[str, Any]] = None, + config_kwargs: Optional[dict[str, Any]] = None, + encode_kwargs: Optional[dict[str, Any]] = None, backend: Literal["torch", "onnx", "openvino"] = "torch", ): """ @@ -112,13 +112,13 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments self.embedding_backend: Optional[_SentenceTransformersSparseEncoderEmbeddingBackend] = None self.backend = backend - def _get_telemetry_data(self) -> Dict[str, Any]: + def _get_telemetry_data(self) -> dict[str, Any]: """ Data that is sent to Posthog for usage analytics. """ return {"model": self.model} - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> dict[str, Any]: """ Serializes the component to a dictionary. 
@@ -147,7 +147,7 @@ def to_dict(self) -> Dict[str, Any]: return serialization_dict @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "SentenceTransformersSparseTextEmbedder": + def from_dict(cls, data: dict[str, Any]) -> "SentenceTransformersSparseTextEmbedder": """ Deserializes the component from a dictionary. @@ -183,7 +183,7 @@ def warm_up(self): if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"): self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"] - @component.output_types(embedding=List[float]) + @component.output_types(embedding=list[float]) def run(self, text: str): """ Embed a single string. diff --git a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py index 839521c87c..f30a0c8184 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py +++ b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py @@ -9,14 +9,18 @@ import torch from haystack import Document +from haystack.components.embedders.backends.sentence_transformers_backend import ( + _SentenceTransformersSparseEmbeddingBackendFactory, + _SentenceTransformersSparseEncoderEmbeddingBackend, +) from haystack.components.embedders.sentence_transformers_sparse_document_embedder import ( SentenceTransformersSparseDocumentEmbedder, ) from haystack.utils import ComponentDevice, Secret -from haystack.components.embedders.backends.sentence_transformers_backend import ( - _SentenceTransformersSparseEmbeddingBackendFactory, - _SentenceTransformersSparseEncoderEmbeddingBackend, +TYPE_NAME = ( + "haystack.components.embedders.sentence_transformers_sparse_document_embedder." 
+ "SentenceTransformersSparseDocumentEmbedder" ) @@ -65,7 +69,7 @@ def test_to_dict(self): component = SentenceTransformersSparseDocumentEmbedder(model="model", device=ComponentDevice.from_str("cpu")) data = component.to_dict() assert data == { - "type": "haystack.components.embedders.sentence_transformers_sparse_document_embedder.SentenceTransformersSparseDocumentEmbedder", + "type": TYPE_NAME, "init_parameters": { "model": "model", "device": ComponentDevice.from_str("cpu").to_dict(), @@ -105,7 +109,7 @@ def test_to_dict_with_custom_init_parameters(self): data = component.to_dict() assert data == { - "type": "haystack.components.embedders.sentence_transformers_sparse_document_embedder.SentenceTransformersSparseDocumentEmbedder", + "type": TYPE_NAME, "init_parameters": { "model": "model", "device": ComponentDevice.from_str("cuda:0").to_dict(), @@ -143,10 +147,7 @@ def test_from_dict(self): "config_kwargs": {"use_memory_efficient_attention": True}, } component = SentenceTransformersSparseDocumentEmbedder.from_dict( - { - "type": "haystack.components.embedders.sentence_transformers_sparse_document_embedder.SentenceTransformersSparseDocumentEmbedder", - "init_parameters": init_parameters, - } + {"type": TYPE_NAME, "init_parameters": init_parameters} ) assert component.model == "model" assert component.device == ComponentDevice.from_str("cuda:0") @@ -164,12 +165,7 @@ def test_from_dict(self): assert component.config_kwargs == {"use_memory_efficient_attention": True} def test_from_dict_no_default_parameters(self): - component = SentenceTransformersSparseDocumentEmbedder.from_dict( - { - "type": "haystack.components.embedders.sentence_transformers_sparse_document_embedder.SentenceTransformersSparseDocumentEmbedder", - "init_parameters": {}, - } - ) + component = SentenceTransformersSparseDocumentEmbedder.from_dict({"type": TYPE_NAME, "init_parameters": {}}) assert component.model == "naver/splade-cocondenser-ensembledistil" assert component.device == ComponentDevice.resolve_device(None) assert component.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False) @@ -197,10 +193,7 @@ def test_from_dict_none_device(self): "local_files_only": False, } component = SentenceTransformersSparseDocumentEmbedder.from_dict( - { - "type": "haystack.components.embedders.sentence_transformers_sparse_document_embedder.SentenceTransformersSparseDocumentEmbedder", - "init_parameters": init_parameters, - } + {"type": TYPE_NAME, "init_parameters": init_parameters} ) assert component.model == "model" assert component.device == ComponentDevice.resolve_device(None) @@ -336,7 +329,8 @@ def test_model_onnx_backend(self, mocked_factory): device=ComponentDevice.from_str("cpu"), model_kwargs={ "file_name": "onnx/model.onnx" - }, # setting the path isn't necessary if the repo contains a "onnx/model.onnx" file but this is to prevent a HF warning + }, # setting the path isn't necessary if the repo contains a "onnx/model.onnx" file but this is to + # prevent a HF warning backend="onnx", ) onnx_embedder.warm_up() @@ -363,7 +357,8 @@ def test_model_openvino_backend(self, mocked_factory): device=ComponentDevice.from_str("cpu"), model_kwargs={ "file_name": "openvino/openvino_model.xml" - }, # setting the path isn't necessary if the repo contains a "openvino/openvino_model.xml" file but this is to prevent a HF warning + }, # setting the path isn't necessary if the repo contains a "openvino/openvino_model.xml" file but this + # is to prevent a HF warning backend="openvino", ) openvino_embedder.warm_up() diff --git 
a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py index 426b809eed..5ae006a3f9 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py +++ b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py @@ -13,6 +13,10 @@ ) from haystack.utils import ComponentDevice, Secret +TYPE_NAME = ( + "haystack.components.embedders.sentence_transformers_sparse_text_embedder.SentenceTransformersSparseTextEmbedder" +) + class TestSentenceTransformersSparseTextEmbedder: def test_init_default(self): @@ -53,7 +57,7 @@ def test_to_dict(self): component = SentenceTransformersSparseTextEmbedder(model="model", device=ComponentDevice.from_str("cpu")) data = component.to_dict() assert data == { - "type": "haystack.components.embedders.sentence_transformers_sparse_text_embedder.SentenceTransformersSparseTextEmbedder", + "type": TYPE_NAME, "init_parameters": { "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"}, "model": "model", @@ -90,7 +94,7 @@ def test_to_dict_with_custom_init_parameters(self): ) data = component.to_dict() assert data == { - "type": "haystack.components.embedders.sentence_transformers_sparse_text_embedder.SentenceTransformersSparseTextEmbedder", + "type": TYPE_NAME, "init_parameters": { "token": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"}, "model": "model", @@ -116,7 +120,7 @@ def test_to_dict_not_serialize_token(self): def test_from_dict(self): data = { - "type": "haystack.components.embedders.sentence_transformers_sparse_text_embedder.SentenceTransformersSparseTextEmbedder", + "type": TYPE_NAME, "init_parameters": { "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"}, "model": "model", @@ -147,10 +151,7 @@ def test_from_dict(self): assert component.config_kwargs == {"use_memory_efficient_attention": False} def test_from_dict_no_default_parameters(self): - data = { - "type": "haystack.components.embedders.sentence_transformers_sparse_text_embedder.SentenceTransformersSparseTextEmbedder", - "init_parameters": {}, - } + data = {"type": TYPE_NAME, "init_parameters": {}} component = SentenceTransformersSparseTextEmbedder.from_dict(data) assert component.model == "naver/splade-cocondenser-ensembledistil" assert component.device == ComponentDevice.resolve_device(None) @@ -164,7 +165,7 @@ def test_from_dict_no_default_parameters(self): def test_from_dict_none_device(self): data = { - "type": "haystack.components.embedders.sentence_transformers_sparse_text_embedder.SentenceTransformersSparseTextEmbedder", + "type": TYPE_NAME, "init_parameters": { "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"}, "model": "model", @@ -257,7 +258,8 @@ def test_model_onnx_backend(self, mocked_factory): device=ComponentDevice.from_str("cpu"), model_kwargs={ "file_name": "onnx/model.onnx" - }, # setting the path isn't necessary if the repo contains a "onnx/model.onnx" file but this is to prevent a HF warning + }, # setting the path isn't necessary if the repo contains a "onnx/model.onnx" file but this is to + # prevent a HF warning backend="onnx", ) onnx_embedder.warm_up() @@ -284,7 +286,8 @@ def test_model_openvino_backend(self, mocked_factory): device=ComponentDevice.from_str("cpu"), model_kwargs={ "file_name": "openvino/openvino_model.xml" - }, # setting the path isn't necessary if the repo contains a "openvino/openvino_model.xml" file 
but this is to prevent a HF warning + }, # setting the path isn't necessary if the repo contains a "openvino/openvino_model.xml" file but + # this is to prevent a HF warning backend="openvino", ) openvino_embedder.warm_up() From a469c8f863d73a8fc7a4aba9c9e6d519a92daced Mon Sep 17 00:00:00 2001 From: ryzhtus Date: Sat, 6 Sep 2025 16:09:17 +0200 Subject: [PATCH 08/20] Style changes and fixes --- .../backends/sentence_transformers_backend.py | 89 --------------- .../sentence_transformers_sparse_backend.py | 105 ++++++++++++++++++ ...e_transformers_sparse_document_embedder.py | 2 +- ...tence_transformers_sparse_text_embedder.py | 5 +- ...sentence_transformers_embedding_backend.py | 52 --------- ...e_transformers_sparse_embedding_backend.py | 66 +++++++++++ 6 files changed, 175 insertions(+), 144 deletions(-) create mode 100644 haystack/components/embedders/backends/sentence_transformers_sparse_backend.py create mode 100644 test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py diff --git a/haystack/components/embedders/backends/sentence_transformers_backend.py b/haystack/components/embedders/backends/sentence_transformers_backend.py index 97be663e27..b06de0794b 100644 --- a/haystack/components/embedders/backends/sentence_transformers_backend.py +++ b/haystack/components/embedders/backends/sentence_transformers_backend.py @@ -4,7 +4,6 @@ from typing import Any, Literal, Optional, Union -from haystack.dataclasses.sparse_embedding import SparseEmbedding from haystack.lazy_imports import LazyImport from haystack.utils.auth import Secret @@ -57,46 +56,6 @@ def get_embedding_backend( # pylint: disable=too-many-positional-arguments return embedding_backend -class _SentenceTransformersSparseEmbeddingBackendFactory: - """ - Factory class to create instances of Sentence Transformers embedding backends. - """ - - _instances: dict[str, "_SentenceTransformersSparseEncoderEmbeddingBackend"] = {} - - @staticmethod - def get_embedding_backend( # pylint: disable=too-many-positional-arguments - model: str, - device: Optional[str] = None, - auth_token: Optional[Secret] = None, - trust_remote_code: bool = False, - local_files_only: bool = False, - model_kwargs: Optional[dict[str, Any]] = None, - tokenizer_kwargs: Optional[dict[str, Any]] = None, - config_kwargs: Optional[dict[str, Any]] = None, - backend: Literal["torch", "onnx", "openvino"] = "torch", - ): - embedding_backend_id = f"{model}{device}{auth_token}{backend}" - - if embedding_backend_id in _SentenceTransformersSparseEmbeddingBackendFactory._instances: - return _SentenceTransformersSparseEmbeddingBackendFactory._instances[embedding_backend_id] - - embedding_backend = _SentenceTransformersSparseEncoderEmbeddingBackend( - model=model, - device=device, - auth_token=auth_token, - trust_remote_code=trust_remote_code, - local_files_only=local_files_only, - model_kwargs=model_kwargs, - tokenizer_kwargs=tokenizer_kwargs, - config_kwargs=config_kwargs, - backend=backend, - ) - - _SentenceTransformersSparseEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend - return embedding_backend - - class _SentenceTransformersEmbeddingBackend: """ Class to manage Sentence Transformers embeddings. 
@@ -135,51 +94,3 @@ def embed(self, data: Union[list[str], list["Image"]], **kwargs: Any) -> list[li # https://sbert.net/examples/sentence_transformer/applications/image-search embeddings = self.model.encode(data, **kwargs).tolist() # type: ignore[arg-type] return embeddings - - -class _SentenceTransformersSparseEncoderEmbeddingBackend: - """ - Class to manage Sparse embeddings from Sentence Transformers. - """ - - def __init__( # pylint: disable=too-many-positional-arguments - self, - model: str, - device: Optional[str] = None, - auth_token: Optional[Secret] = None, - trust_remote_code: bool = False, - local_files_only: bool = False, - model_kwargs: Optional[dict[str, Any]] = None, - tokenizer_kwargs: Optional[dict[str, Any]] = None, - config_kwargs: Optional[dict[str, Any]] = None, - backend: Literal["torch", "onnx", "openvino"] = "torch", - ): - sentence_transformers_import.check() - - self.model = SparseEncoder( - model_name_or_path=model, - device=device, - token=auth_token.resolve_value() if auth_token else None, - trust_remote_code=trust_remote_code, - local_files_only=local_files_only, - model_kwargs=model_kwargs, - tokenizer_kwargs=tokenizer_kwargs, - config_kwargs=config_kwargs, - backend=backend, - ) - - def embed(self, data: list[str], **kwargs) -> list[SparseEmbedding]: - embeddings = self.model.encode(data, **kwargs).coalesce() - - rows, columns = embeddings.indices() - values = embeddings.values() - batch_size = embeddings.size(0) - - sparse_embeddings: list[SparseEmbedding] = [] - for embedding in range(batch_size): - mask = rows == embedding - embedding_columns = columns[mask].tolist() - embedding_values = values[mask].tolist() - sparse_embeddings.append(SparseEmbedding(indices=embedding_columns, values=embedding_values)) - - return sparse_embeddings diff --git a/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py b/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py new file mode 100644 index 0000000000..d6d6ebb1f4 --- /dev/null +++ b/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py @@ -0,0 +1,105 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Literal, Optional + +from haystack.dataclasses.sparse_embedding import SparseEmbedding +from haystack.lazy_imports import LazyImport +from haystack.utils.auth import Secret + +with LazyImport(message="Run 'pip install \"sentence-transformers>=5.0.0\"'") as sentence_transformers_import: + from sentence_transformers import SentenceTransformer, SparseEncoder + + +class _SentenceTransformersSparseEmbeddingBackendFactory: + """ + Factory class to create instances of Sentence Transformers embedding backends. 
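+    Backends are cached under a key built from the model, device, auth token, and backend,
+    so requesting the same configuration again returns the already-created instance.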
+ """ + + _instances: dict[str, "_SentenceTransformersSparseEncoderEmbeddingBackend"] = {} + + @staticmethod + def get_embedding_backend( # pylint: disable=too-many-positional-arguments + *, + model: str, + device: Optional[str] = None, + auth_token: Optional[Secret] = None, + trust_remote_code: bool = False, + local_files_only: bool = False, + model_kwargs: Optional[dict[str, Any]] = None, + tokenizer_kwargs: Optional[dict[str, Any]] = None, + config_kwargs: Optional[dict[str, Any]] = None, + backend: Literal["torch", "onnx", "openvino"] = "torch", + ): + embedding_backend_id = f"{model}{device}{auth_token}{backend}" + + if embedding_backend_id in _SentenceTransformersSparseEmbeddingBackendFactory._instances: + return _SentenceTransformersSparseEmbeddingBackendFactory._instances[embedding_backend_id] + + embedding_backend = _SentenceTransformersSparseEncoderEmbeddingBackend( + model=model, + device=device, + auth_token=auth_token, + trust_remote_code=trust_remote_code, + local_files_only=local_files_only, + model_kwargs=model_kwargs, + tokenizer_kwargs=tokenizer_kwargs, + config_kwargs=config_kwargs, + backend=backend, + ) + + _SentenceTransformersSparseEmbeddingBackendFactory._instances[embedding_backend_id] = embedding_backend + return embedding_backend + + +class _SentenceTransformersSparseEncoderEmbeddingBackend: + """ + Class to manage Sparse embeddings from Sentence Transformers. + """ + + def __init__( # pylint: disable=too-many-positional-arguments + self, + *, + model: str, + device: Optional[str] = None, + auth_token: Optional[Secret] = None, + trust_remote_code: bool = False, + local_files_only: bool = False, + model_kwargs: Optional[dict[str, Any]] = None, + tokenizer_kwargs: Optional[dict[str, Any]] = None, + config_kwargs: Optional[dict[str, Any]] = None, + backend: Literal["torch", "onnx", "openvino"] = "torch", + ): + sentence_transformers_import.check() + + self.model = SparseEncoder( + model_name_or_path=model, + device=device, + token=auth_token.resolve_value() if auth_token else None, + trust_remote_code=trust_remote_code, + local_files_only=local_files_only, + model_kwargs=model_kwargs, + tokenizer_kwargs=tokenizer_kwargs, + config_kwargs=config_kwargs, + backend=backend, + ) + + def embed(self, *, data: list[str], **kwargs) -> list[SparseEmbedding]: + embeddings = self.model.encode(data, **kwargs).coalesce() + + # Extract the row indices, column indices, values, and batch size from the sparse tensor embeddings + rows, columns = embeddings.indices() + values = embeddings.values() + batch_size = embeddings.size(0) + + sparse_embeddings: list[SparseEmbedding] = [] + for embedding in range(batch_size): + # For each embedding in the batch, create a mask to select its corresponding indices and values + mask = rows == embedding + # Extract the column indices and values for the current embedding in the batch + embedding_columns = columns[mask].tolist() + embedding_values = values[mask].tolist() + sparse_embeddings.append(SparseEmbedding(indices=embedding_columns, values=embedding_values)) + + return sparse_embeddings diff --git a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py index 87d4159dc4..fef9578f2a 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py @@ -225,6 +225,6 @@ def run(self, documents: list[Document]): ) for doc, emb in 
zip(documents, embeddings): - doc.embedding = emb + doc.sparse_embedding = emb return {"documents": documents} diff --git a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py index 710f78dac9..7376f88913 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py @@ -9,6 +9,7 @@ _SentenceTransformersSparseEmbeddingBackendFactory, _SentenceTransformersSparseEncoderEmbeddingBackend, ) +from haystack.dataclasses.sparse_embedding import SparseEmbedding from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs @@ -183,7 +184,7 @@ def warm_up(self): if self.tokenizer_kwargs and self.tokenizer_kwargs.get("model_max_length"): self.embedding_backend.model.max_seq_length = self.tokenizer_kwargs["model_max_length"] - @component.output_types(embedding=list[float]) + @component.output_types(sparse_embedding=SparseEmbedding) def run(self, text: str): """ Embed a single string. @@ -211,4 +212,4 @@ def run(self, text: str): show_progress_bar=self.progress_bar, **(self.encode_kwargs if self.encode_kwargs else {}), )[0] - return {"embedding": embedding} + return {"sparse_embedding": embedding} diff --git a/test/components/embedders/test_sentence_transformers_embedding_backend.py b/test/components/embedders/test_sentence_transformers_embedding_backend.py index 552cc3df41..db9c1d8113 100644 --- a/test/components/embedders/test_sentence_transformers_embedding_backend.py +++ b/test/components/embedders/test_sentence_transformers_embedding_backend.py @@ -9,7 +9,6 @@ from haystack.components.embedders.backends.sentence_transformers_backend import ( _SentenceTransformersEmbeddingBackendFactory, - _SentenceTransformersSparseEmbeddingBackendFactory, ) from haystack.utils.auth import Secret @@ -28,20 +27,6 @@ def test_factory_behavior(mock_sentence_transformer): assert another_embedding_backend is not embedding_backend -@patch("haystack.components.embedders.backends.sentence_transformers_backend.SparseEncoder") -def test_sparse_factory_behavior(mock_sparse_encoder): - embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( - model="my_model", device="cpu" - ) - same_embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend("my_model", "cpu") - another_embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( - model="another_model", device="cpu" - ) - - assert same_embedding_backend is embedding_backend - assert another_embedding_backend is not embedding_backend - - @patch("haystack.components.embedders.backends.sentence_transformers_backend.SentenceTransformer") def test_model_initialization(mock_sentence_transformer): _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend( @@ -67,29 +52,6 @@ def test_model_initialization(mock_sentence_transformer): ) -@patch("haystack.components.embedders.backends.sentence_transformers_backend.SparseEncoder") -def test_sparse_model_initialization(mock_sparse_encoder): - _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( - model="model", - device="cpu", - auth_token=Secret.from_token("fake-api-token"), - trust_remote_code=True, - local_files_only=True, - backend="torch", - ) - mock_sparse_encoder.assert_called_once_with( - 
model_name_or_path="model", - device="cpu", - token="fake-api-token", - trust_remote_code=True, - local_files_only=True, - model_kwargs=None, - tokenizer_kwargs=None, - config_kwargs=None, - backend="torch", - ) - - @patch("haystack.components.embedders.backends.sentence_transformers_backend.SentenceTransformer") def test_embedding_function_with_kwargs(mock_sentence_transformer): embedding_backend = _SentenceTransformersEmbeddingBackendFactory.get_embedding_backend(model="model") @@ -98,17 +60,3 @@ def test_embedding_function_with_kwargs(mock_sentence_transformer): embedding_backend.embed(data=data, normalize_embeddings=True) embedding_backend.model.encode.assert_called_once_with(data, normalize_embeddings=True) - - -@patch("haystack.components.embedders.backends.sentence_transformers_backend.SparseEncoder") -def test_sparse_embedding_function_with_kwargs(mock_sparse_encoder): - indices = torch.tensor([[0, 1], [1, 3]]) - values = torch.tensor([0.5, 0.7]) - mock_sparse_encoder.return_value.encode.return_value = torch.sparse_coo_tensor(indices, values, (2, 5)) - - embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend(model="model") - - data = ["sentence1", "sentence2"] - embedding_backend.embed(data=data, attn_implementation="sdpa") - - embedding_backend.model.encode.assert_called_once_with(data, attn_implementation="sdpa") diff --git a/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py b/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py new file mode 100644 index 0000000000..c20f491c1f --- /dev/null +++ b/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import patch + +import pytest +import torch + +from haystack.components.embedders.backends.sentence_transformers_sparse_backend import ( + _SentenceTransformersSparseEmbeddingBackendFactory, +) +from haystack.utils.auth import Secret + + +@patch("haystack.components.embedders.backends.sentence_transformers_sparse_backend.SparseEncoder") +def test_sparse_factory_behavior(mock_sparse_encoder): + embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( + model="my_model", device="cpu" + ) + same_embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( + model="my_model", device="cpu" + ) + another_embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( + model="another_model", device="cpu" + ) + + assert same_embedding_backend is embedding_backend + assert another_embedding_backend is not embedding_backend + + +@patch("haystack.components.embedders.backends.sentence_transformers_sparse_backend.SparseEncoder") +def test_sparse_model_initialization(mock_sparse_encoder): + _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( + model="model", + device="cpu", + auth_token=Secret.from_token("fake-api-token"), + trust_remote_code=True, + local_files_only=True, + backend="torch", + ) + mock_sparse_encoder.assert_called_once_with( + model_name_or_path="model", + device="cpu", + token="fake-api-token", + trust_remote_code=True, + local_files_only=True, + model_kwargs=None, + tokenizer_kwargs=None, + config_kwargs=None, + backend="torch", + ) + + +@patch("haystack.components.embedders.backends.sentence_transformers_sparse_backend.SparseEncoder") +def 
test_sparse_embedding_function_with_kwargs(mock_sparse_encoder): + indices = torch.tensor([[0, 1], [1, 3]]) + values = torch.tensor([0.5, 0.7]) + mock_sparse_encoder.return_value.encode.return_value = torch.sparse_coo_tensor(indices, values, (2, 5)) + + embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend(model="model") + + data = ["sentence1", "sentence2"] + embedding_backend.embed(data=data, attn_implementation="sdpa") + + embedding_backend.model.encode.assert_called_once_with(data, attn_implementation="sdpa") From be29552561be9eedffaa073f02c445f6d5cb10f7 Mon Sep 17 00:00:00 2001 From: ryzhtus Date: Sun, 14 Sep 2025 13:08:47 +0200 Subject: [PATCH 09/20] Added a test for embed function --- ...e_transformers_sparse_embedding_backend.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py b/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py index c20f491c1f..85962d63d5 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py +++ b/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py @@ -10,6 +10,7 @@ from haystack.components.embedders.backends.sentence_transformers_sparse_backend import ( _SentenceTransformersSparseEmbeddingBackendFactory, ) +from haystack.dataclasses.sparse_embedding import SparseEmbedding from haystack.utils.auth import Secret @@ -64,3 +65,37 @@ def test_sparse_embedding_function_with_kwargs(mock_sparse_encoder): embedding_backend.embed(data=data, attn_implementation="sdpa") embedding_backend.model.encode.assert_called_once_with(data, attn_implementation="sdpa") + + +@patch("haystack.components.embedders.backends.sentence_transformers_sparse_backend.SparseEncoder") +def test_sparse_embedding_function(mock_sparse_encoder): + """ + Test that the backend's embed method returns the correct sparse embeddings. + """ + + # Ensure the factory cache is cleared before each test. 
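+    # (the factory caches one backend per model/device/token/backend combination, so a
+    # cached instance left over from an earlier test would bypass the mocked SparseEncoder)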
+ _SentenceTransformersSparseEmbeddingBackendFactory._instances = {} + + # Mocking the sparse tensor output from the model's encode method + indices = torch.tensor([[0, 0, 1], [1, 4, 2]]) # (row, col) indices for a batch of 2 + values = torch.tensor([0.5, 0.8, 0.3]) + shape = (2, 5) # Batch size of 2, dimension of 5 + mock_sparse_tensor = torch.sparse_coo_tensor(indices, values, shape) + mock_sparse_encoder.return_value.encode.return_value = mock_sparse_tensor + + # Get the embedding backend + embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend(model="model") + + # Embed dummy data + data = ["sentence1", "sentence2"] + sparse_embeddings = embedding_backend.embed(data=data) + # Expected output + expected_embeddings = [ + SparseEmbedding(indices=[1, 4], values=[0.5, 0.8]), + SparseEmbedding(indices=[2], values=[0.3]), + ] + + assert len(sparse_embeddings) == len(expected_embeddings) + for got, exp in zip(sparse_embeddings, expected_embeddings): + assert got.indices == exp.indices + assert got.values == pytest.approx(exp.values) From f7536f94dcf1c5000babec6bf3ab1aa9d53a5204 Mon Sep 17 00:00:00 2001 From: ryzhtus Date: Sun, 14 Sep 2025 13:57:38 +0200 Subject: [PATCH 10/20] Added integration test and fixed some other tests --- ...e_transformers_sparse_document_embedder.py | 2 +- ...tence_transformers_sparse_text_embedder.py | 2 +- ...e_transformers_sparse_document_embedder.py | 45 ++++++++++++++++--- ...tence_transformers_sparse_text_embedder.py | 30 +++++++++++-- 4 files changed, 66 insertions(+), 13 deletions(-) diff --git a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py index fef9578f2a..8bafe0794b 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py @@ -5,7 +5,7 @@ from typing import Any, Dict, List, Literal, Optional from haystack import Document, component, default_from_dict, default_to_dict -from haystack.components.embedders.backends.sentence_transformers_backend import ( +from haystack.components.embedders.backends.sentence_transformers_sparse_backend import ( _SentenceTransformersSparseEmbeddingBackendFactory, _SentenceTransformersSparseEncoderEmbeddingBackend, ) diff --git a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py index 7376f88913..1c94201ef6 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py @@ -5,7 +5,7 @@ from typing import Any, Dict, List, Literal, Optional from haystack import component, default_from_dict, default_to_dict -from haystack.components.embedders.backends.sentence_transformers_backend import ( +from haystack.components.embedders.backends.sentence_transformers_sparse_backend import ( _SentenceTransformersSparseEmbeddingBackendFactory, _SentenceTransformersSparseEncoderEmbeddingBackend, ) diff --git a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py index f30a0c8184..a1ac8e43d1 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py +++ 
b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py @@ -9,13 +9,10 @@ import torch from haystack import Document -from haystack.components.embedders.backends.sentence_transformers_backend import ( - _SentenceTransformersSparseEmbeddingBackendFactory, - _SentenceTransformersSparseEncoderEmbeddingBackend, -) from haystack.components.embedders.sentence_transformers_sparse_document_embedder import ( SentenceTransformersSparseDocumentEmbedder, ) +from haystack.dataclasses.sparse_embedding import SparseEmbedding from haystack.utils import ComponentDevice, Secret TYPE_NAME = ( @@ -247,7 +244,7 @@ def test_run(self): embedder = SentenceTransformersSparseDocumentEmbedder(model="model") embedder.embedding_backend = MagicMock() embedder.embedding_backend.embed = lambda x, **kwargs: [ - [random.random() for _ in range(16)] for _ in range(len(x)) + SparseEmbedding(indices=[0, 2, 5], values=[0.1, 0.2, 0.3]) for _ in range(len(x)) ] documents = [Document(content=f"document number {i}") for i in range(5)] @@ -258,8 +255,9 @@ def test_run(self): assert len(result["documents"]) == len(documents) for doc in result["documents"]: assert isinstance(doc, Document) - assert isinstance(doc.embedding, list) - assert isinstance(doc.embedding[0], float) + assert isinstance(doc.sparse_embedding, SparseEmbedding) + assert isinstance(doc.sparse_embedding.indices[0], int) + assert isinstance(doc.sparse_embedding.values[0], float) def test_run_wrong_input_format(self): embedder = SentenceTransformersSparseDocumentEmbedder(model="model") @@ -399,3 +397,36 @@ def test_dtype_on_gpu(self, mocked_factory, model_kwargs): config_kwargs=None, backend="torch", ) + + @pytest.mark.integration + @pytest.mark.slow + @pytest.mark.flaky(reruns=3, reruns_delay=10) + def test_live_run_sparse_document_embedder(self): + pytest.importorskip("sentence_transformers", reason="sentence-transformers is required for this test") + + docs = [ + Document(content="I love cheese", meta={"topic": "Cuisine"}), + Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), + ] + + embedder = SentenceTransformersSparseDocumentEmbedder( + model="naver/splade-cocondenser-ensembledistil", + meta_fields_to_embed=["topic"], + embedding_separator=" | ", + device=ComponentDevice.from_str("cpu"), + ) + embedder.warm_up() + result = embedder.run(documents=docs) + documents_with_embeddings = result["documents"] + + assert isinstance(documents_with_embeddings, list) + assert len(documents_with_embeddings) == len(docs) + for doc in documents_with_embeddings: + assert isinstance(doc, Document) + assert hasattr(doc, "sparse_embedding") + assert isinstance(doc.sparse_embedding, SparseEmbedding) + assert isinstance(doc.sparse_embedding.indices, list) + assert isinstance(doc.sparse_embedding.values, list) + assert len(doc.sparse_embedding.indices) == len(doc.sparse_embedding.values) + # Expect at least one non-zero entry + assert len(doc.sparse_embedding.indices) > 0 diff --git a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py index 5ae006a3f9..744c28c5db 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py +++ b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py @@ -11,6 +11,7 @@ from haystack.components.embedders.sentence_transformers_sparse_text_embedder import ( SentenceTransformersSparseTextEmbedder, ) +from 
haystack.dataclasses.sparse_embedding import SparseEmbedding from haystack.utils import ComponentDevice, Secret TYPE_NAME = ( @@ -228,16 +229,17 @@ def test_run(self): embedder = SentenceTransformersSparseTextEmbedder(model="model") embedder.embedding_backend = MagicMock() embedder.embedding_backend.embed = lambda x, **kwargs: [ - [random.random() for _ in range(16)] for _ in range(len(x)) + SparseEmbedding(indices=[1, 3], values=[0.5, 0.7]) for _ in range(len(x)) ] text = "a nice text to embed" result = embedder.run(text=text) - embedding = result["embedding"] + sparse_embedding = result["sparse_embedding"] - assert isinstance(embedding, list) - assert all(isinstance(el, float) for el in embedding) + assert isinstance(sparse_embedding, SparseEmbedding) + assert sparse_embedding.indices == [1, 3] + assert sparse_embedding.values == [0.5, 0.7] def test_run_wrong_input_format(self): embedder = SentenceTransformersSparseTextEmbedder(model="model") @@ -328,3 +330,23 @@ def test_dtype_on_gpu(self, mocked_factory, model_kwargs): config_kwargs=None, backend="torch", ) + + @pytest.mark.integration + @pytest.mark.slow + @pytest.mark.flaky(reruns=3, reruns_delay=10) + def test_live_run_sparse_text_embedder(self): + pytest.importorskip("sentence_transformers", reason="sentence-transformers is required for this test") + + text = "I love Nine Inch Nails" + embedder = SentenceTransformersSparseTextEmbedder( + model="naver/splade-cocondenser-ensembledistil", device=ComponentDevice.from_str("cpu") + ) + embedder.warm_up() + result = embedder.run(text=text) + sparse_embedding = result["sparse_embedding"] + + assert isinstance(sparse_embedding, SparseEmbedding) + assert isinstance(sparse_embedding.indices, list) + assert isinstance(sparse_embedding.values, list) + assert len(sparse_embedding.indices) == len(sparse_embedding.values) + assert len(sparse_embedding.indices) > 0 From 90dd503b834f4dd81034b8065250c004574cb1ae Mon Sep 17 00:00:00 2001 From: ryzhtus Date: Sun, 14 Sep 2025 18:00:47 +0200 Subject: [PATCH 11/20] Add lint fixes --- .../backends/sentence_transformers_sparse_backend.py | 2 +- .../sentence_transformers_sparse_document_embedder.py | 2 +- .../embedders/sentence_transformers_sparse_text_embedder.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py b/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py index d6d6ebb1f4..e9579438c9 100644 --- a/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py +++ b/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py @@ -86,7 +86,7 @@ def __init__( # pylint: disable=too-many-positional-arguments ) def embed(self, *, data: list[str], **kwargs) -> list[SparseEmbedding]: - embeddings = self.model.encode(data, **kwargs).coalesce() + embeddings = self.model.encode(data, **kwargs).coalesce() # type: ignore[attr-defined] # Extract the row indices, column indices, values, and batch size from the sparse tensor embeddings rows, columns = embeddings.indices() diff --git a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py index 8bafe0794b..ea0af6e0c8 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py @@ -221,7 +221,7 @@ def run(self, documents: 
list[Document]): texts_to_embed.append(text_to_embed) embeddings = self.embedding_backend.embed( - texts_to_embed, batch_size=self.batch_size, show_progress_bar=self.progress_bar + data=texts_to_embed, batch_size=self.batch_size, show_progress_bar=self.progress_bar ) for doc, emb in zip(documents, embeddings): diff --git a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py index 1c94201ef6..600ebe77bd 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py @@ -206,10 +206,12 @@ def run(self, text: str): raise RuntimeError("The embedding model has not been loaded. Please call warm_up() before running.") text_to_embed = self.prefix + text + self.suffix + embedding = self.embedding_backend.embed( - [text_to_embed], + data=[text_to_embed], batch_size=self.batch_size, show_progress_bar=self.progress_bar, **(self.encode_kwargs if self.encode_kwargs else {}), )[0] + return {"sparse_embedding": embedding} From 21313ce341c0a81d147d21b2e9093eac72fbeb86 Mon Sep 17 00:00:00 2001 From: ryzhtus Date: Sun, 14 Sep 2025 18:33:15 +0200 Subject: [PATCH 12/20] Fixed positional arguments --- ...sentence_transformers_sparse_document_embedder.py | 12 +++++++----- ...est_sentence_transformers_sparse_text_embedder.py | 8 +++++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py index a1ac8e43d1..6b3d1fd05d 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py +++ b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py @@ -243,9 +243,11 @@ def test_warmup_doesnt_reload(self, mocked_factory): def test_run(self): embedder = SentenceTransformersSparseDocumentEmbedder(model="model") embedder.embedding_backend = MagicMock() - embedder.embedding_backend.embed = lambda x, **kwargs: [ - SparseEmbedding(indices=[0, 2, 5], values=[0.1, 0.2, 0.3]) for _ in range(len(x)) - ] + + def fake_embed(data, **kwargs): + return [SparseEmbedding(indices=[0, 2, 5], values=[0.1, 0.2, 0.3]) for _ in range(len(data))] + + embedder.embedding_backend.embed = fake_embed documents = [Document(content=f"document number {i}") for i in range(5)] @@ -283,7 +285,7 @@ def test_embed_metadata(self): documents = [Document(content=f"document number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)] embedder.run(documents=documents) embedder.embedding_backend.embed.assert_called_once_with( - [ + data=[ "meta_value 0\ndocument number 0", "meta_value 1\ndocument number 1", "meta_value 2\ndocument number 2", @@ -306,7 +308,7 @@ def test_prefix_suffix(self): documents = [Document(content=f"document number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)] embedder.run(documents=documents) embedder.embedding_backend.embed.assert_called_once_with( - [ + data=[ "my_prefix meta_value 0\ndocument number 0 my_suffix", "my_prefix meta_value 1\ndocument number 1 my_suffix", "my_prefix meta_value 2\ndocument number 2 my_suffix", diff --git a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py index 744c28c5db..3e1fbe66a8 100644 --- 
a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py +++ b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py @@ -228,9 +228,11 @@ def test_warmup_doesnt_reload(self, mocked_factory): def test_run(self): embedder = SentenceTransformersSparseTextEmbedder(model="model") embedder.embedding_backend = MagicMock() - embedder.embedding_backend.embed = lambda x, **kwargs: [ - SparseEmbedding(indices=[1, 3], values=[0.5, 0.7]) for _ in range(len(x)) - ] + + def fake_embed(data, **kwargs): + return [SparseEmbedding(indices=[1, 3], values=[0.5, 0.7]) for _ in range(len(data))] + + embedder.embedding_backend.embed = fake_embed text = "a nice text to embed" From 60e2805dc5f2ba57f89471cd758db84fba613286 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 18 Sep 2025 18:26:27 +0200 Subject: [PATCH 13/20] fix types, simplify and more --- .github/workflows/slow.yml | 4 ++++ ...sentence_transformers_embedding_backend.py | 3 --- ...e_transformers_sparse_document_embedder.py | 8 ++++--- ...e_transformers_sparse_embedding_backend.py | 22 +++++++++---------- ...tence_transformers_sparse_text_embedder.py | 10 +++++---- 5 files changed, 26 insertions(+), 21 deletions(-) diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index 4e75e81754..5d9f03f4ad 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -62,6 +62,8 @@ jobs: - "haystack/components/embedders/hugging_face_api_text_embedder.py" - "haystack/components/embedders/image/sentence_transformers_doc_image_embedder.py" - "haystack/components/embedders/sentence_transformers_text_embedder.py" + - "haystack/components/embedders/sentence_transformers_sparse_document_embedder.py" + - "haystack/components/embedders/sentence_transformers_sparse_text_embedder.py" - "haystack/components/evaluators/sas_evaluator.py" - "haystack/components/generators/chat/hugging_face_api.py" - "haystack/components/generators/chat/hugging_face_local.py" @@ -81,6 +83,8 @@ jobs: - "test/components/embedders/test_hugging_face_api_text_embedder.py" - "test/components/embedders/image/test_sentence_transformers_doc_image_embedder.py" - "test/components/embedders/test_sentence_transformers_text_embedder.py" + - "test/components/embedders/test_sentence_transformers_sparse_document_embedder.py" + - "test/components/embedders/test_sentence_transformers_sparse_text_embedder.py" - "test/components/evaluators/test_sas_evaluator.py" - "test/components/generators/chat/test_hugging_face_api.py" - "test/components/generators/chat/test_hugging_face_local.py" diff --git a/test/components/embedders/test_sentence_transformers_embedding_backend.py b/test/components/embedders/test_sentence_transformers_embedding_backend.py index db9c1d8113..f869c23e35 100644 --- a/test/components/embedders/test_sentence_transformers_embedding_backend.py +++ b/test/components/embedders/test_sentence_transformers_embedding_backend.py @@ -4,9 +4,6 @@ from unittest.mock import patch -import pytest -import torch - from haystack.components.embedders.backends.sentence_transformers_backend import ( _SentenceTransformersEmbeddingBackendFactory, ) diff --git a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py index 6b3d1fd05d..f246ff22d5 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py +++ b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py @@ -2,7 +2,6 @@ # # 
SPDX-License-Identifier: Apache-2.0 -import random from unittest.mock import MagicMock, patch import pytest @@ -277,6 +276,11 @@ def test_run_wrong_input_format(self): ): embedder.run(documents=list_integers_input) + def test_run_no_warmup(self): + embedder = SentenceTransformersSparseDocumentEmbedder(model="model") + with pytest.raises(RuntimeError, match="The embedding model has not been loaded."): + embedder.run(documents=[Document(content="test")]) + def test_embed_metadata(self): embedder = SentenceTransformersSparseDocumentEmbedder( model="model", meta_fields_to_embed=["meta_field"], embedding_separator="\n" @@ -404,8 +408,6 @@ def test_dtype_on_gpu(self, mocked_factory, model_kwargs): @pytest.mark.slow @pytest.mark.flaky(reruns=3, reruns_delay=10) def test_live_run_sparse_document_embedder(self): - pytest.importorskip("sentence_transformers", reason="sentence-transformers is required for this test") - docs = [ Document(content="I love cheese", meta={"topic": "Cuisine"}), Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), diff --git a/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py b/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py index 85962d63d5..05f6921dc3 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py +++ b/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py @@ -55,16 +55,14 @@ def test_sparse_model_initialization(mock_sparse_encoder): @patch("haystack.components.embedders.backends.sentence_transformers_sparse_backend.SparseEncoder") def test_sparse_embedding_function_with_kwargs(mock_sparse_encoder): - indices = torch.tensor([[0, 1], [1, 3]]) - values = torch.tensor([0.5, 0.7]) - mock_sparse_encoder.return_value.encode.return_value = torch.sparse_coo_tensor(indices, values, (2, 5)) - embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend(model="model") data = ["sentence1", "sentence2"] embedding_backend.embed(data=data, attn_implementation="sdpa") - embedding_backend.model.encode.assert_called_once_with(data, attn_implementation="sdpa") + embedding_backend.model.encode.assert_called_once_with( + data, convert_to_tensor=False, convert_to_sparse_tensor=True, attn_implementation="sdpa" + ) @patch("haystack.components.embedders.backends.sentence_transformers_sparse_backend.SparseEncoder") @@ -76,12 +74,13 @@ def test_sparse_embedding_function(mock_sparse_encoder): # Ensure the factory cache is cleared before each test. 
_SentenceTransformersSparseEmbeddingBackendFactory._instances = {} - # Mocking the sparse tensor output from the model's encode method - indices = torch.tensor([[0, 0, 1], [1, 4, 2]]) # (row, col) indices for a batch of 2 - values = torch.tensor([0.5, 0.8, 0.3]) - shape = (2, 5) # Batch size of 2, dimension of 5 - mock_sparse_tensor = torch.sparse_coo_tensor(indices, values, shape) - mock_sparse_encoder.return_value.encode.return_value = mock_sparse_tensor + # size = (5,) + + tensors = [ + torch.sparse_coo_tensor(torch.tensor([[1, 4]]), torch.tensor([0.5, 0.8])), + torch.sparse_coo_tensor(torch.tensor([[2]]), torch.tensor([0.3])), + ] + mock_sparse_encoder.return_value.encode.return_value = tensors # Get the embedding backend embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend(model="model") @@ -89,6 +88,7 @@ def test_sparse_embedding_function(mock_sparse_encoder): # Embed dummy data data = ["sentence1", "sentence2"] sparse_embeddings = embedding_backend.embed(data=data) + # Expected output expected_embeddings = [ SparseEmbedding(indices=[1, 4], values=[0.5, 0.8]), diff --git a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py index 3e1fbe66a8..be93423303 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py +++ b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -import random from unittest.mock import MagicMock, patch import pytest @@ -202,7 +201,7 @@ def test_warmup(self, mocked_factory): ) mocked_factory.get_embedding_backend.assert_not_called() embedder.warm_up() - embedder.embedding_backend.model.max_seq_length = 512 + mocked_factory.get_embedding_backend.assert_called_once_with( model="model", device="cpu", @@ -225,6 +224,11 @@ def test_warmup_doesnt_reload(self, mocked_factory): embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once() + def test_run_no_warmup(self): + embedder = SentenceTransformersSparseTextEmbedder(model="model") + with pytest.raises(RuntimeError, match="The embedding model has not been loaded"): + embedder.run(text="a nice text to embed") + def test_run(self): embedder = SentenceTransformersSparseTextEmbedder(model="model") embedder.embedding_backend = MagicMock() @@ -337,8 +341,6 @@ def test_dtype_on_gpu(self, mocked_factory, model_kwargs): @pytest.mark.slow @pytest.mark.flaky(reruns=3, reruns_delay=10) def test_live_run_sparse_text_embedder(self): - pytest.importorskip("sentence_transformers", reason="sentence-transformers is required for this test") - text = "I love Nine Inch Nails" embedder = SentenceTransformersSparseTextEmbedder( model="naver/splade-cocondenser-ensembledistil", device=ComponentDevice.from_str("cpu") From 3620d09dea0ebad8af692874b33492bda4594669 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 18 Sep 2025 18:28:24 +0200 Subject: [PATCH 14/20] fix --- .../sentence_transformers_sparse_backend.py | 25 +++++++++---------- ...e_transformers_sparse_document_embedder.py | 8 +++--- ...e_transformers_sparse_embedding_backend.py | 2 -- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py b/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py index e9579438c9..c44fb5addc 100644 --- 
a/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py +++ b/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py @@ -86,20 +86,19 @@ def __init__( # pylint: disable=too-many-positional-arguments ) def embed(self, *, data: list[str], **kwargs) -> list[SparseEmbedding]: - embeddings = self.model.encode(data, **kwargs).coalesce() # type: ignore[attr-defined] - - # Extract the row indices, column indices, values, and batch size from the sparse tensor embeddings - rows, columns = embeddings.indices() - values = embeddings.values() - batch_size = embeddings.size(0) + embeddings_list = self.model.encode( + data, + convert_to_tensor=False, # output is a list of individual tensors + convert_to_sparse_tensor=True, + **kwargs, + ) sparse_embeddings: list[SparseEmbedding] = [] - for embedding in range(batch_size): - # For each embedding in the batch, create a mask to select its corresponding indices and values - mask = rows == embedding - # Extract the column indices and values for the current embedding in the batch - embedding_columns = columns[mask].tolist() - embedding_values = values[mask].tolist() - sparse_embeddings.append(SparseEmbedding(indices=embedding_columns, values=embedding_values)) + for embedding_tensor in embeddings_list: + # encode returns a list of tensors with the parameters above, but the type hint is too broad + embedding_tensor = embedding_tensor.coalesce() # type: ignore[union-attr] + indices = embedding_tensor.indices()[0].tolist() # Only column indices + values = embedding_tensor.values().tolist() + sparse_embeddings.append(SparseEmbedding(indices=indices, values=values)) return sparse_embeddings diff --git a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py index ea0af6e0c8..2ff1671cbb 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py @@ -2,7 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Literal, Optional +from dataclasses import replace +from typing import Any, Literal, Optional from haystack import Document, component, default_from_dict, default_to_dict from haystack.components.embedders.backends.sentence_transformers_sparse_backend import ( @@ -224,7 +225,8 @@ def run(self, documents: list[Document]): data=texts_to_embed, batch_size=self.batch_size, show_progress_bar=self.progress_bar ) + documents_with_embeddings = [] for doc, emb in zip(documents, embeddings): - doc.sparse_embedding = emb + documents_with_embeddings.append(replace(doc, sparse_embedding=emb)) - return {"documents": documents} + return {"documents": documents_with_embeddings} diff --git a/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py b/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py index 05f6921dc3..5c26974149 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py +++ b/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py @@ -74,8 +74,6 @@ def test_sparse_embedding_function(mock_sparse_encoder): # Ensure the factory cache is cleared before each test. 
_SentenceTransformersSparseEmbeddingBackendFactory._instances = {} - # size = (5,) - tensors = [ torch.sparse_coo_tensor(torch.tensor([[1, 4]]), torch.tensor([0.5, 0.8])), torch.sparse_coo_tensor(torch.tensor([[2]]), torch.tensor([0.3])), From d95b0e993d3d094fc62ad0099bee6ad2d75be26c Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 19 Sep 2025 12:07:49 +0200 Subject: [PATCH 15/20] token fixes --- .../embedders/backends/sentence_transformers_backend.py | 2 +- .../backends/sentence_transformers_sparse_backend.py | 2 +- .../embedders/sentence_transformers_sparse_text_embedder.py | 2 +- haystack/components/evaluators/sas_evaluator.py | 2 +- .../components/rankers/sentence_transformers_diversity.py | 2 +- .../components/rankers/sentence_transformers_similarity.py | 2 +- .../test_sentence_transformers_sparse_document_embedder.py | 5 ++++- .../test_sentence_transformers_sparse_text_embedder.py | 5 ++++- 8 files changed, 14 insertions(+), 8 deletions(-) diff --git a/haystack/components/embedders/backends/sentence_transformers_backend.py b/haystack/components/embedders/backends/sentence_transformers_backend.py index b06de0794b..5cb0a54729 100644 --- a/haystack/components/embedders/backends/sentence_transformers_backend.py +++ b/haystack/components/embedders/backends/sentence_transformers_backend.py @@ -8,7 +8,7 @@ from haystack.utils.auth import Secret with LazyImport(message="Run 'pip install \"sentence-transformers>=5.0.0\"'") as sentence_transformers_import: - from sentence_transformers import SentenceTransformer, SparseEncoder + from sentence_transformers import SentenceTransformer with LazyImport(message="Run 'pip install \"pillow\"'") as pillow_import: from PIL.Image import Image diff --git a/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py b/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py index c44fb5addc..30300dd4cd 100644 --- a/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py +++ b/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py @@ -9,7 +9,7 @@ from haystack.utils.auth import Secret with LazyImport(message="Run 'pip install \"sentence-transformers>=5.0.0\"'") as sentence_transformers_import: - from sentence_transformers import SentenceTransformer, SparseEncoder + from sentence_transformers import SparseEncoder class _SentenceTransformersSparseEmbeddingBackendFactory: diff --git a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py index 600ebe77bd..6273e37f1b 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Literal, Optional +from typing import Any, Literal, Optional from haystack import component, default_from_dict, default_to_dict from haystack.components.embedders.backends.sentence_transformers_sparse_backend import ( diff --git a/haystack/components/evaluators/sas_evaluator.py b/haystack/components/evaluators/sas_evaluator.py index 49d1a23388..b2d7d91dbf 100644 --- a/haystack/components/evaluators/sas_evaluator.py +++ b/haystack/components/evaluators/sas_evaluator.py @@ -11,7 +11,7 @@ from haystack.utils import ComponentDevice, expit from haystack.utils.auth import Secret, deserialize_secrets_inplace -with LazyImport(message="Run 'pip install 
\"sentence-transformers>=4.1.0\"'") as sas_import: +with LazyImport(message="Run 'pip install \"sentence-transformers>=5.0.0\"'") as sas_import: from sentence_transformers import CrossEncoder, SentenceTransformer, util from transformers import AutoConfig diff --git a/haystack/components/rankers/sentence_transformers_diversity.py b/haystack/components/rankers/sentence_transformers_diversity.py index 24f848c640..ee77469f00 100644 --- a/haystack/components/rankers/sentence_transformers_diversity.py +++ b/haystack/components/rankers/sentence_transformers_diversity.py @@ -10,7 +10,7 @@ from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs -with LazyImport(message="Run 'pip install \"sentence-transformers>=4.1.0\"'") as torch_and_sentence_transformers_import: +with LazyImport(message="Run 'pip install \"sentence-transformers>=5.0.0\"'") as torch_and_sentence_transformers_import: import torch from sentence_transformers import SentenceTransformer diff --git a/haystack/components/rankers/sentence_transformers_similarity.py b/haystack/components/rankers/sentence_transformers_similarity.py index 4617ffcf2c..c0657f469c 100644 --- a/haystack/components/rankers/sentence_transformers_similarity.py +++ b/haystack/components/rankers/sentence_transformers_similarity.py @@ -11,7 +11,7 @@ from haystack.utils import ComponentDevice, Secret, deserialize_secrets_inplace from haystack.utils.hf import deserialize_hf_model_kwargs, serialize_hf_model_kwargs -with LazyImport(message="Run 'pip install \"sentence-transformers>=4.1.0\"'") as torch_and_sentence_transformers_import: +with LazyImport(message="Run 'pip install \"sentence-transformers>=5.0.0\"'") as torch_and_sentence_transformers_import: from sentence_transformers import CrossEncoder from torch.nn import Identity, Sigmoid diff --git a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py index f246ff22d5..b15dd4055d 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py +++ b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py @@ -407,7 +407,10 @@ def test_dtype_on_gpu(self, mocked_factory, model_kwargs): @pytest.mark.integration @pytest.mark.slow @pytest.mark.flaky(reruns=3, reruns_delay=10) - def test_live_run_sparse_document_embedder(self): + def test_live_run_sparse_document_embedder(self, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 + monkeypatch.delenv("HF_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 + docs = [ Document(content="I love cheese", meta={"topic": "Cuisine"}), Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), diff --git a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py index be93423303..7316a413a0 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py +++ b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py @@ -340,7 +340,10 @@ def test_dtype_on_gpu(self, mocked_factory, model_kwargs): @pytest.mark.integration @pytest.mark.slow @pytest.mark.flaky(reruns=3, reruns_delay=10) - def test_live_run_sparse_text_embedder(self): + def 
test_live_run_sparse_text_embedder(self, monkeypatch): + monkeypatch.delenv("HF_API_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 + monkeypatch.delenv("HF_TOKEN", raising=False) # https://github.com/deepset-ai/haystack/issues/8811 + text = "I love Nine Inch Nails" embedder = SentenceTransformersSparseTextEmbedder( model="naver/splade-cocondenser-ensembledistil", device=ComponentDevice.from_str("cpu") From 527e24baf6e328f63682ba09de1146569e0b1069 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 19 Sep 2025 13:20:19 +0200 Subject: [PATCH 16/20] pydocs, small model in test, cache improvement --- docs/pydoc/config/embedders_api.yml | 2 + .../pydoc/config_docusaurus/embedders_api.yml | 2 + .../sentence_transformers_sparse_backend.py | 19 ++++++-- ...e_transformers_sparse_document_embedder.py | 21 +++++---- ...tence_transformers_sparse_text_embedder.py | 43 +++++-------------- ...e_transformers_sparse_document_embedder.py | 16 +++---- ...e_transformers_sparse_embedding_backend.py | 4 ++ ...tence_transformers_sparse_text_embedder.py | 40 ++++------------- 8 files changed, 60 insertions(+), 87 deletions(-) diff --git a/docs/pydoc/config/embedders_api.yml b/docs/pydoc/config/embedders_api.yml index b96dfaccde..415218f124 100644 --- a/docs/pydoc/config/embedders_api.yml +++ b/docs/pydoc/config/embedders_api.yml @@ -11,6 +11,8 @@ loaders: "openai_text_embedder", "sentence_transformers_document_embedder", "sentence_transformers_text_embedder", + "sentence_transformers_sparse_document_embedder", + "sentence_transformers_sparse_text_embedder", "image/sentence_transformers_doc_image_embedder", ] ignore_when_discovered: ["__init__"] diff --git a/docs/pydoc/config_docusaurus/embedders_api.yml b/docs/pydoc/config_docusaurus/embedders_api.yml index 5f86541804..c613be326f 100644 --- a/docs/pydoc/config_docusaurus/embedders_api.yml +++ b/docs/pydoc/config_docusaurus/embedders_api.yml @@ -11,6 +11,8 @@ loaders: "openai_text_embedder", "sentence_transformers_document_embedder", "sentence_transformers_text_embedder", + "sentence_transformers_sparse_document_embedder", + "sentence_transformers_sparse_text_embedder", "image/sentence_transformers_doc_image_embedder", ] ignore_when_discovered: ["__init__"] diff --git a/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py b/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py index 30300dd4cd..c2d992e766 100644 --- a/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py +++ b/haystack/components/embedders/backends/sentence_transformers_sparse_backend.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import json from typing import Any, Literal, Optional from haystack.dataclasses.sparse_embedding import SparseEmbedding @@ -20,7 +21,7 @@ class _SentenceTransformersSparseEmbeddingBackendFactory: _instances: dict[str, "_SentenceTransformersSparseEncoderEmbeddingBackend"] = {} @staticmethod - def get_embedding_backend( # pylint: disable=too-many-positional-arguments + def get_embedding_backend( *, model: str, device: Optional[str] = None, @@ -32,7 +33,19 @@ def get_embedding_backend( # pylint: disable=too-many-positional-arguments config_kwargs: Optional[dict[str, Any]] = None, backend: Literal["torch", "onnx", "openvino"] = "torch", ): - embedding_backend_id = f"{model}{device}{auth_token}{backend}" + cache_params = { + "model": model, + "device": device, + "auth_token": auth_token, + "trust_remote_code": trust_remote_code, + "local_files_only": 
local_files_only, + "model_kwargs": model_kwargs, + "tokenizer_kwargs": tokenizer_kwargs, + "config_kwargs": config_kwargs, + "backend": backend, + } + + embedding_backend_id = json.dumps(cache_params, sort_keys=True, default=str) if embedding_backend_id in _SentenceTransformersSparseEmbeddingBackendFactory._instances: return _SentenceTransformersSparseEmbeddingBackendFactory._instances[embedding_backend_id] @@ -58,7 +71,7 @@ class _SentenceTransformersSparseEncoderEmbeddingBackend: Class to manage Sparse embeddings from Sentence Transformers. """ - def __init__( # pylint: disable=too-many-positional-arguments + def __init__( self, *, model: str, diff --git a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py index 2ff1671cbb..291a7c3114 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py @@ -17,9 +17,9 @@ @component class SentenceTransformersSparseDocumentEmbedder: """ - Calculates document embeddings using sprase embedding models from Sentence Transformers. + Calculates document sparse embeddings using sparse embedding models from Sentence Transformers. - It stores the embeddings in the `embedding` metadata field of each document. + It stores the sparse embeddings in the `sparse_embedding` metadata field of each document. You can also embed documents' metadata. Use this component in indexing pipelines to embed input documents and send them to DocumentWriter to write a into a Document Store. @@ -34,15 +34,16 @@ class SentenceTransformersSparseDocumentEmbedder: doc_embedder.warm_up() result = doc_embedder.run([doc]) - print(result['documents'][0].embedding) + print(result['documents'][0].sparse_embedding) - # [-0.07804739475250244, 0.1498992145061493, ...] + # SparseEmbedding(indices=[999, 1045, ...], values=[0.918, 0.867, ...]) ``` """ - def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments + def __init__( # noqa: PLR0913 self, - model: str = "naver/splade-cocondenser-ensembledistil", + *, + model: str = "prithivida/Splade_PP_en_v2", device: Optional[ComponentDevice] = None, token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False), prefix: str = "", @@ -59,10 +60,10 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments backend: Literal["torch", "onnx", "openvino"] = "torch", ): """ - Creates a SentenceTransformersDocumentEmbedder component. + Creates a SentenceTransformersSparseDocumentEmbedder component. :param model: - The model to use for calculating embeddings. + The model to use for calculating sparse embeddings. Pass a local path or ID of the model on Hugging Face. :param device: The device to use for loading the model. @@ -71,8 +72,6 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments The API token to download private models from Hugging Face. :param prefix: A string to add at the beginning of each document text. - Can be used to prepend the text with an instruction, as required by some embedding models, - such as E5 and bge. :param suffix: A string to add at the end of each document text. :param batch_size: @@ -201,7 +200,7 @@ def run(self, documents: list[Document]): :returns: A dictionary with the following keys: - - `documents`: Documents with embeddings. 
+ - `documents`: Documents with sparse embeddings under the `sparse_embedding` field. """ if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): raise TypeError( diff --git a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py index 6273e37f1b..6683d055ba 100644 --- a/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py +++ b/haystack/components/embedders/sentence_transformers_sparse_text_embedder.py @@ -17,9 +17,9 @@ @component class SentenceTransformersSparseTextEmbedder: """ - Embeds strings using sprase embedding models from Sentence Transformers. + Embeds strings using sparse embedding models from Sentence Transformers. - You can use it to embed user query and send it to an embedding retriever. + You can use it to embed user query and send it to a sparse embedding retriever. Usage example: ```python @@ -32,19 +32,18 @@ class SentenceTransformersSparseTextEmbedder: print(text_embedder.run(text_to_embed)) - # {'embedding': [-0.07804739475250244, 0.1498992145061493,, ...]} + # {'sparse_embedding': SparseEmbedding(indices=[999, 1045, ...], values=[0.918, 0.867, ...])} ``` """ - def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments + def __init__( # noqa: PLR0913 self, - model: str = "naver/splade-cocondenser-ensembledistil", + *, + model: str = "prithivida/Splade_PP_en_v2", device: Optional[ComponentDevice] = None, token: Optional[Secret] = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False), prefix: str = "", suffix: str = "", - batch_size: int = 32, - progress_bar: bool = True, trust_remote_code: bool = False, local_files_only: bool = False, model_kwargs: Optional[dict[str, Any]] = None, @@ -57,7 +56,7 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments Create a SentenceTransformersSparseTextEmbedder component. :param model: - The model to use for calculating embeddings. + The model to use for calculating sparse embeddings. Specify the path to a local model or the ID of the model on Hugging Face. :param device: Overrides the default device used to load the model. @@ -65,15 +64,8 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments An API token to use private models from Hugging Face. :param prefix: A string to add at the beginning of each text to be embedded. - You can use it to prepend the text with an instruction, as required by some embedding models, - such as E5 and bge. :param suffix: A string to add at the end of each text to embed. - :param batch_size: - Number of texts to embed at once. - :param progress_bar: - If `True`, shows a progress bar for calculating embeddings. - If `False`, disables the progress bar. :param trust_remote_code: If `False`, permits only Hugging Face verified model architectures. If `True`, permits custom models and scripts. @@ -87,10 +79,6 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments Refer to specific model documentation for available kwargs. :param config_kwargs: Additional keyword arguments for `AutoConfig.from_pretrained` when loading the model configuration. - :param encode_kwargs: - Additional keyword arguments for `SentenceTransformer.encode` when embedding texts. - This parameter is provided for fine customization. Be careful not to clash with already set parameters and - avoid passing parameters that change the output type. 
:param backend: The backend to use for the Sentence Transformers model. Choose from "torch", "onnx", or "openvino". Refer to the [Sentence Transformers documentation](https://sbert.net/docs/sentence_transformer/usage/efficiency.html) @@ -102,14 +90,11 @@ def __init__( # noqa: PLR0913 # pylint: disable=too-many-positional-arguments self.token = token self.prefix = prefix self.suffix = suffix - self.batch_size = batch_size - self.progress_bar = progress_bar self.trust_remote_code = trust_remote_code self.local_files_only = local_files_only self.model_kwargs = model_kwargs self.tokenizer_kwargs = tokenizer_kwargs self.config_kwargs = config_kwargs - self.encode_kwargs = encode_kwargs self.embedding_backend: Optional[_SentenceTransformersSparseEncoderEmbeddingBackend] = None self.backend = backend @@ -133,14 +118,11 @@ def to_dict(self) -> dict[str, Any]: token=self.token.to_dict() if self.token else None, prefix=self.prefix, suffix=self.suffix, - batch_size=self.batch_size, - progress_bar=self.progress_bar, trust_remote_code=self.trust_remote_code, local_files_only=self.local_files_only, model_kwargs=self.model_kwargs, tokenizer_kwargs=self.tokenizer_kwargs, config_kwargs=self.config_kwargs, - encode_kwargs=self.encode_kwargs, backend=self.backend, ) if serialization_dict["init_parameters"].get("model_kwargs") is not None: @@ -194,7 +176,7 @@ def run(self, text: str): :returns: A dictionary with the following keys: - - `embedding`: The embedding of the input text. + - `sparse_embedding`: The sparse embedding of the input text. """ if not isinstance(text, str): raise TypeError( @@ -207,11 +189,6 @@ def run(self, text: str): text_to_embed = self.prefix + text + self.suffix - embedding = self.embedding_backend.embed( - data=[text_to_embed], - batch_size=self.batch_size, - show_progress_bar=self.progress_bar, - **(self.encode_kwargs if self.encode_kwargs else {}), - )[0] + sparse_embedding = self.embedding_backend.embed(data=[text_to_embed])[0] - return {"sparse_embedding": embedding} + return {"sparse_embedding": sparse_embedding} diff --git a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py index b15dd4055d..3ae63b7786 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py +++ b/test/components/embedders/test_sentence_transformers_sparse_document_embedder.py @@ -162,7 +162,7 @@ def test_from_dict(self): def test_from_dict_no_default_parameters(self): component = SentenceTransformersSparseDocumentEmbedder.from_dict({"type": TYPE_NAME, "init_parameters": {}}) - assert component.model == "naver/splade-cocondenser-ensembledistil" + assert component.model == "prithivida/Splade_PP_en_v2" assert component.device == ComponentDevice.resolve_device(None) assert component.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False) assert component.prefix == "" @@ -328,7 +328,7 @@ def test_prefix_suffix(self): ) def test_model_onnx_backend(self, mocked_factory): onnx_embedder = SentenceTransformersSparseDocumentEmbedder( - model="naver/splade-cocondenser-ensembledistil", + model="prithivida/Splade_PP_en_v2", token=None, device=ComponentDevice.from_str("cpu"), model_kwargs={ @@ -340,7 +340,7 @@ def test_model_onnx_backend(self, mocked_factory): onnx_embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( - model="naver/splade-cocondenser-ensembledistil", + model="prithivida/Splade_PP_en_v2", device="cpu", 
auth_token=None, trust_remote_code=False, @@ -356,7 +356,7 @@ def test_model_onnx_backend(self, mocked_factory): ) def test_model_openvino_backend(self, mocked_factory): openvino_embedder = SentenceTransformersSparseDocumentEmbedder( - model="naver/splade-cocondenser-ensembledistil", + model="prithivida/Splade_PP_en_v2", token=None, device=ComponentDevice.from_str("cpu"), model_kwargs={ @@ -368,7 +368,7 @@ def test_model_openvino_backend(self, mocked_factory): openvino_embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( - model="naver/splade-cocondenser-ensembledistil", + model="prithivida/Splade_PP_en_v2", device="cpu", auth_token=None, trust_remote_code=False, @@ -385,7 +385,7 @@ def test_model_openvino_backend(self, mocked_factory): @pytest.mark.parametrize("model_kwargs", [{"torch_dtype": "bfloat16"}, {"torch_dtype": "float16"}]) def test_dtype_on_gpu(self, mocked_factory, model_kwargs): torch_dtype_embedder = SentenceTransformersSparseDocumentEmbedder( - model="naver/splade-cocondenser-ensembledistil", + model="prithivida/Splade_PP_en_v2", token=None, device=ComponentDevice.from_str("cuda:0"), model_kwargs=model_kwargs, @@ -393,7 +393,7 @@ def test_dtype_on_gpu(self, mocked_factory, model_kwargs): torch_dtype_embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( - model="naver/splade-cocondenser-ensembledistil", + model="prithivida/Splade_PP_en_v2", device="cuda:0", auth_token=None, trust_remote_code=False, @@ -417,7 +417,7 @@ def test_live_run_sparse_document_embedder(self, monkeypatch): ] embedder = SentenceTransformersSparseDocumentEmbedder( - model="naver/splade-cocondenser-ensembledistil", + model="sparse-encoder-testing/splade-bert-tiny-nq", meta_fields_to_embed=["topic"], embedding_separator=" | ", device=ComponentDevice.from_str("cpu"), diff --git a/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py b/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py index 5c26974149..4079c69f8a 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py +++ b/test/components/embedders/test_sentence_transformers_sparse_embedding_backend.py @@ -25,9 +25,13 @@ def test_sparse_factory_behavior(mock_sparse_encoder): another_embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( model="another_model", device="cpu" ) + yet_another_embedding_backend = _SentenceTransformersSparseEmbeddingBackendFactory.get_embedding_backend( + model="my_model", device="cpu", trust_remote_code=True + ) assert same_embedding_backend is embedding_backend assert another_embedding_backend is not embedding_backend + assert yet_another_embedding_backend is not embedding_backend @patch("haystack.components.embedders.backends.sentence_transformers_sparse_backend.SparseEncoder") diff --git a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py index 7316a413a0..b6246e925e 100644 --- a/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py +++ b/test/components/embedders/test_sentence_transformers_sparse_text_embedder.py @@ -26,8 +26,6 @@ def test_init_default(self): assert embedder.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False) assert embedder.prefix == "" assert embedder.suffix == "" - assert embedder.batch_size == 32 - assert embedder.progress_bar is True assert embedder.trust_remote_code is 
False assert embedder.local_files_only is False @@ -38,8 +36,6 @@ def test_init_with_parameters(self): token=Secret.from_token("fake-api-token"), prefix="prefix", suffix="suffix", - batch_size=64, - progress_bar=False, trust_remote_code=True, local_files_only=True, ) @@ -48,8 +44,6 @@ def test_init_with_parameters(self): assert embedder.token == Secret.from_token("fake-api-token") assert embedder.prefix == "prefix" assert embedder.suffix == "suffix" - assert embedder.batch_size == 64 - assert embedder.progress_bar is False assert embedder.trust_remote_code is True assert embedder.local_files_only is True @@ -64,13 +58,10 @@ def test_to_dict(self): "device": ComponentDevice.from_str("cpu").to_dict(), "prefix": "", "suffix": "", - "batch_size": 32, - "progress_bar": True, "trust_remote_code": False, "local_files_only": False, "model_kwargs": None, "tokenizer_kwargs": None, - "encode_kwargs": None, "config_kwargs": None, "backend": "torch", }, @@ -83,8 +74,6 @@ def test_to_dict_with_custom_init_parameters(self): token=Secret.from_env_var("ENV_VAR", strict=False), prefix="prefix", suffix="suffix", - batch_size=64, - progress_bar=False, trust_remote_code=True, local_files_only=True, model_kwargs={"torch_dtype": torch.float32}, @@ -101,14 +90,11 @@ def test_to_dict_with_custom_init_parameters(self): "device": ComponentDevice.from_str("cuda:0").to_dict(), "prefix": "prefix", "suffix": "suffix", - "batch_size": 64, - "progress_bar": False, "trust_remote_code": True, "local_files_only": True, "model_kwargs": {"torch_dtype": "torch.float32"}, "tokenizer_kwargs": {"model_max_length": 512}, "config_kwargs": {"use_memory_efficient_attention": False}, - "encode_kwargs": {"task": "clustering"}, "backend": "torch", }, } @@ -127,8 +113,6 @@ def test_from_dict(self): "device": ComponentDevice.from_str("cpu").to_dict(), "prefix": "", "suffix": "", - "batch_size": 32, - "progress_bar": True, "trust_remote_code": False, "local_files_only": False, "model_kwargs": {"torch_dtype": "torch.float32"}, @@ -142,8 +126,6 @@ def test_from_dict(self): assert component.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False) assert component.prefix == "" assert component.suffix == "" - assert component.batch_size == 32 - assert component.progress_bar is True assert component.trust_remote_code is False assert component.local_files_only is False assert component.model_kwargs == {"torch_dtype": torch.float32} @@ -153,13 +135,11 @@ def test_from_dict(self): def test_from_dict_no_default_parameters(self): data = {"type": TYPE_NAME, "init_parameters": {}} component = SentenceTransformersSparseTextEmbedder.from_dict(data) - assert component.model == "naver/splade-cocondenser-ensembledistil" + assert component.model == "prithivida/Splade_PP_en_v2" assert component.device == ComponentDevice.resolve_device(None) assert component.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False) assert component.prefix == "" assert component.suffix == "" - assert component.batch_size == 32 - assert component.progress_bar is True assert component.trust_remote_code is False assert component.local_files_only is False @@ -172,8 +152,6 @@ def test_from_dict_none_device(self): "device": None, "prefix": "", "suffix": "", - "batch_size": 32, - "progress_bar": True, "trust_remote_code": False, "local_files_only": False, }, @@ -184,8 +162,6 @@ def test_from_dict_none_device(self): assert component.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False) assert component.prefix == "" assert 
component.suffix == "" - assert component.batch_size == 32 - assert component.progress_bar is True assert component.trust_remote_code is False assert component.local_files_only is False @@ -261,7 +237,7 @@ def test_run_wrong_input_format(self): ) def test_model_onnx_backend(self, mocked_factory): onnx_embedder = SentenceTransformersSparseTextEmbedder( - model="naver/splade-cocondenser-ensembledistil", + model="prithivida/Splade_PP_en_v2", token=None, device=ComponentDevice.from_str("cpu"), model_kwargs={ @@ -273,7 +249,7 @@ def test_model_onnx_backend(self, mocked_factory): onnx_embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( - model="naver/splade-cocondenser-ensembledistil", + model="prithivida/Splade_PP_en_v2", device="cpu", auth_token=None, trust_remote_code=False, @@ -289,7 +265,7 @@ def test_model_onnx_backend(self, mocked_factory): ) def test_model_openvino_backend(self, mocked_factory): openvino_embedder = SentenceTransformersSparseTextEmbedder( - model="naver/splade-cocondenser-ensembledistil", + model="prithivida/Splade_PP_en_v2", token=None, device=ComponentDevice.from_str("cpu"), model_kwargs={ @@ -301,7 +277,7 @@ def test_model_openvino_backend(self, mocked_factory): openvino_embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( - model="naver/splade-cocondenser-ensembledistil", + model="prithivida/Splade_PP_en_v2", device="cpu", auth_token=None, trust_remote_code=False, @@ -318,7 +294,7 @@ def test_model_openvino_backend(self, mocked_factory): @pytest.mark.parametrize("model_kwargs", [{"torch_dtype": "bfloat16"}, {"torch_dtype": "float16"}]) def test_dtype_on_gpu(self, mocked_factory, model_kwargs): torch_dtype_embedder = SentenceTransformersSparseTextEmbedder( - model="naver/splade-cocondenser-ensembledistil", + model="prithivida/Splade_PP_en_v2", token=None, device=ComponentDevice.from_str("cuda:0"), model_kwargs=model_kwargs, @@ -326,7 +302,7 @@ def test_dtype_on_gpu(self, mocked_factory, model_kwargs): torch_dtype_embedder.warm_up() mocked_factory.get_embedding_backend.assert_called_once_with( - model="naver/splade-cocondenser-ensembledistil", + model="prithivida/Splade_PP_en_v2", device="cuda:0", auth_token=None, trust_remote_code=False, @@ -346,7 +322,7 @@ def test_live_run_sparse_text_embedder(self, monkeypatch): text = "I love Nine Inch Nails" embedder = SentenceTransformersSparseTextEmbedder( - model="naver/splade-cocondenser-ensembledistil", device=ComponentDevice.from_str("cpu") + model="sparse-encoder-testing/splade-bert-tiny-nq", device=ComponentDevice.from_str("cpu") ) embedder.warm_up() result = embedder.run(text=text) From 06e8d8bbd32e8f66f0beeacc7fba12a9df614c17 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 19 Sep 2025 14:14:59 +0200 Subject: [PATCH 17/20] try 3.9 for docs --- .github/workflows/readme_sync.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/readme_sync.yml b/.github/workflows/readme_sync.yml index dd65da84dc..3f8a1949dc 100644 --- a/.github/workflows/readme_sync.yml +++ b/.github/workflows/readme_sync.yml @@ -14,7 +14,7 @@ on: env: HATCH_VERSION: "1.14.1" - PYTHON_VERSION: "3.10" + PYTHON_VERSION: "3.9" jobs: sync: From 044652dde4716e27422d13d9af188b257f8c9e0e Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 19 Sep 2025 14:19:15 +0200 Subject: [PATCH 18/20] better to pin click --- .github/workflows/readme_sync.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/readme_sync.yml 
b/.github/workflows/readme_sync.yml
index 3f8a1949dc..5a524b1eef 100644
--- a/.github/workflows/readme_sync.yml
+++ b/.github/workflows/readme_sync.yml
@@ -14,7 +14,7 @@ on:
 
 env:
   HATCH_VERSION: "1.14.1"
-  PYTHON_VERSION: "3.9"
+  PYTHON_VERSION: "3.10"
 
 jobs:
   sync:
@@ -29,7 +29,7 @@ jobs:
           python-version: "${{ env.PYTHON_VERSION }}"
 
       - name: Install Hatch
-        run: pip install hatch==${{ env.HATCH_VERSION }}
+        run: pip install "hatch==${{ env.HATCH_VERSION }}" "click<8.3.0"
 
       - name: Generate API docs
         env:

From cfc7dda7bf40c769bc87c5e2ea0d4dfd1bf033e3 Mon Sep 17 00:00:00 2001
From: anakin87
Date: Fri, 19 Sep 2025 15:29:26 +0200
Subject: [PATCH 19/20] release note

---
 .../st-sparse-embedders-497f45db848c89eb.yaml | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 releasenotes/notes/st-sparse-embedders-497f45db848c89eb.yaml

diff --git a/releasenotes/notes/st-sparse-embedders-497f45db848c89eb.yaml b/releasenotes/notes/st-sparse-embedders-497f45db848c89eb.yaml
new file mode 100644
index 0000000000..fef684a82f
--- /dev/null
+++ b/releasenotes/notes/st-sparse-embedders-497f45db848c89eb.yaml
@@ -0,0 +1,21 @@
+---
+features:
+  - |
+    Introduce `SentenceTransformersSparseTextEmbedder` and `SentenceTransformersSparseDocumentEmbedder` components.
+    These components embed text and documents using sparse embedding models compatible with
+    Sentence Transformers.
+    Sparse embeddings are interpretable, efficient when used with inverted indexes, combine classic information
+    retrieval with neural models, and are complementary to dense embeddings.
+    Currently, the produced `SparseEmbedding` objects are compatible with the `QdrantDocumentStore`.
+
+    Usage example:
+    ```python
+    from haystack.components.embedders import SentenceTransformersSparseTextEmbedder
+
+    text_embedder = SentenceTransformersSparseTextEmbedder()
+    text_embedder.warm_up()
+
+    print(text_embedder.run("I love pizza!"))
+
+    # {'sparse_embedding': SparseEmbedding(indices=[999, 1045, ...], values=[0.918, 0.867, ...])}
+    ```

From 4e7850c170aded1d97367e9f8b252a396971f9e0 Mon Sep 17 00:00:00 2001
From: anakin87
Date: Fri, 19 Sep 2025 15:50:33 +0200
Subject: [PATCH 20/20] small fix

---
 .../embedders/sentence_transformers_sparse_document_embedder.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py
index 291a7c3114..3ac732ccc4 100644
--- a/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py
+++ b/haystack/components/embedders/sentence_transformers_sparse_document_embedder.py
@@ -29,6 +29,7 @@ class SentenceTransformersSparseDocumentEmbedder:
     ```python
     from haystack import Document
     from haystack.components.embedders import SentenceTransformersSparseDocumentEmbedder
+
     doc = Document(content="I love pizza!")
     doc_embedder = SentenceTransformersSparseDocumentEmbedder()
     doc_embedder.warm_up()
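
Taken together, the two components introduced in this series compose as follows. This is a minimal sketch based only on the code in the patches above, assuming the post-[PATCH 16/20] defaults (keyword-only arguments, `prithivida/Splade_PP_en_v2` as the default model); the query text is illustrative, and a sparse-capable store such as `QdrantDocumentStore` is still needed to actually retrieve with these embeddings.

```python
from haystack import Document
from haystack.components.embedders import (
    SentenceTransformersSparseDocumentEmbedder,
    SentenceTransformersSparseTextEmbedder,
)

# Indexing side: run() returns copies of the input documents, each with a
# SparseEmbedding attached to its `sparse_embedding` field (the inputs are
# not mutated, per the dataclasses.replace() change in [PATCH 14/20]).
doc_embedder = SentenceTransformersSparseDocumentEmbedder()
doc_embedder.warm_up()  # required: run() raises RuntimeError otherwise
docs = doc_embedder.run(documents=[Document(content="I love pizza!")])["documents"]
print(docs[0].sparse_embedding)

# Query side: embed the user query with the same sparse model.
text_embedder = SentenceTransformersSparseTextEmbedder()
text_embedder.warm_up()
query_embedding = text_embedder.run(text="What food do you love?")["sparse_embedding"]
print(query_embedding.indices[:5], query_embedding.values[:5])
```

Using the same model on both the indexing and the query side matters: a sparse query embedding only scores against document embeddings whose indices refer to the same token vocabulary.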