-
Notifications
You must be signed in to change notification settings - Fork 4.5k
Expand file tree
/
Copy pathtypes.py
More file actions
173 lines (146 loc) · 5.2 KB
/
types.py
File metadata and controls
173 lines (146 loc) · 5.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Core types for RAG pipelines.
This module contains the core dataclasses used throughout the RAG pipeline
implementation. The primary type is EmbeddableItem, which represents any
content that can be embedded and stored in a vector database.
Types:
- Content: Container for embeddable content
- Embedding: Vector embedding with optional metadata
- EmbeddableItem: Universal container for embeddable content
- Chunk: Alias for EmbeddableItem (backward compatibility)
"""
import uuid
from dataclasses import dataclass
from dataclasses import field
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union
@dataclass
class Content:
    """Payload that can be handed to an embedding step.

    Both fields default to ``None``, so an instance may carry text, an
    image, both, or neither.

    Args:
        text: Text content to be embedded.
        image: Image given either as raw bytes or as a path/URI string
            (e.g., 'gs://bucket/img.jpg').
    """
    text: Optional[str] = None
    image: Union[bytes, str, None] = None
@dataclass
class Embedding:
    """Vector representation(s) produced for a piece of content.

    Both fields default to ``None``; either or both may be populated.

    Args:
        dense_embedding: Dense vector as a list of floats.
        sparse_embedding: Optional sparse vector for hybrid search, stored
            as a pair of lists (ints, floats) -- presumably indices and
            their values; confirm against the producing embedding step.
    """
    dense_embedding: Optional[List[float]] = None
    sparse_embedding: Optional[Tuple[List[int], List[float]]] = None
@dataclass
class EmbeddableItem:
    """Universal container for embeddable content.

    Represents any content that can be embedded and stored in a vector
    database. Use the factory methods for convenient construction, or
    construct directly with a Content object.

    Examples:
        Text (via factory):
            item = EmbeddableItem.from_text(
                "hello world", metadata={'src': 'doc'})

        Image (via factory):
            item = EmbeddableItem.from_image(
                'gs://bucket/img.jpg')

        Text (direct, equivalent to old Chunk usage):
            item = EmbeddableItem(
                content=Content(text="hello"), index=3)

    Args:
        content: The content to embed.
        id: Unique identifier. Defaults to a freshly generated UUID4 string.
        index: Position within source document (for chunking use cases).
        metadata: Additional metadata (e.g., document source, language).
            Defaults to a new empty dict per instance.
        embedding: Embedding populated by the embedding step.
    """
    content: Content
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    index: int = 0
    metadata: Dict[str, Any] = field(default_factory=dict)
    embedding: Optional[Embedding] = None

    @classmethod
    def from_text(
        cls,
        text: str,
        *,
        id: Optional[str] = None,
        index: int = 0,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> 'EmbeddableItem':
        """Create an EmbeddableItem with text content.

        Args:
            text: The text content to embed.
            id: Unique identifier. A UUID4 string is generated only when
                this is None; any other value is used as given.
            index: Position within source document (for chunking).
            metadata: Additional metadata. A new empty dict is used only
                when this is None; a provided dict is stored as-is.

        Returns:
            A new instance whose content holds only ``text``.
        """
        # Test against None rather than truthiness so that explicitly
        # supplied falsy values ('' or {}) are honoured, matching what the
        # dataclass constructor does with the same arguments.
        return cls(
            content=Content(text=text),
            id=str(uuid.uuid4()) if id is None else id,
            index=index,
            metadata={} if metadata is None else metadata,
        )

    @classmethod
    def from_image(
        cls,
        image: Union[bytes, str],
        *,
        id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> 'EmbeddableItem':
        """Create an EmbeddableItem with image content.

        Args:
            image: Image bytes or path/URI (e.g. GCS path).
            id: Unique identifier. A UUID4 string is generated only when
                this is None; any other value is used as given.
            metadata: Additional metadata. A new empty dict is used only
                when this is None; a provided dict is stored as-is.

        Returns:
            A new instance whose content holds only ``image``; ``index``
            keeps its default of 0.
        """
        # Same None-vs-falsy handling as from_text.
        return cls(
            content=Content(image=image),
            id=str(uuid.uuid4()) if id is None else id,
            metadata={} if metadata is None else metadata,
        )

    @property
    def dense_embedding(self) -> Optional[List[float]]:
        """Dense vector of the attached embedding, or None if unset."""
        return self.embedding.dense_embedding if self.embedding else None

    @property
    def sparse_embedding(self) -> Optional[Tuple[List[int], List[float]]]:
        """Sparse vector of the attached embedding, or None if unset."""
        return self.embedding.sparse_embedding if self.embedding else None

    @property
    def content_string(self) -> str:
        """Returns storable string content for ingestion.

        Falls back through content fields in priority order:
        text > image URI.

        Raises:
            ValueError: If there is no text and the image is not a string
                (i.e. it is raw bytes or None).
        """
        if self.content.text is not None:
            return self.content.text
        # Only a str image (path/URI) is storable as text; bytes are not.
        if isinstance(self.content.image, str):
            return self.content.image
        raise ValueError(
            'EmbeddableItem does not contain storable string content'
            f' (text or image URI). {self}')
# Backward-compatibility alias. Chunk is bound to the same class object as
# EmbeddableItem (it is not a subclass), so existing code that uses Chunk
# continues to work unchanged.
Chunk = EmbeddableItem