-
Notifications
You must be signed in to change notification settings - Fork 4.5k
Expand file tree
/
Copy pathtest_utils.py
More file actions
447 lines (383 loc) · 16.5 KB
/
test_utils.py
File metadata and controls
447 lines (383 loc) · 16.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import contextlib
import logging
import os
import socket
import tempfile
import unittest
from dataclasses import dataclass
from typing import Callable
from typing import List
from typing import Optional
from typing import cast
from apache_beam.ml.rag.types import Chunk
from apache_beam.ml.rag.utils import retry_with_backoff
# pylint: disable=ungrouped-imports
try:
import yaml
from pymilvus import CollectionSchema
from pymilvus import FieldSchema
from pymilvus import MilvusClient
from pymilvus.exceptions import MilvusException
from pymilvus.milvus_client import IndexParams
from testcontainers.core.config import testcontainers_config
from testcontainers.core.generic import DbContainer
from testcontainers.milvus import MilvusContainer
from apache_beam.ml.rag.enrichment.milvus_search import MilvusConnectionParameters
except ImportError as e:
raise unittest.SkipTest(f'RAG test util dependencies not installed: {str(e)}')
_LOGGER = logging.getLogger(__name__)
@dataclass
class VectorDBContainerInfo:
  """Container information for vector database test instances.

  Holds connection details and container reference for testing with
  vector databases like Milvus in containerized environments.
  """
  # Handle to the running testcontainers database container.
  container: DbContainer
  # Hostname or IP where the database service is reachable.
  host: str
  # Host port mapped to the database service's container port.
  port: int
  # Optional credentials; empty strings mean anonymous/default access.
  user: str = ""
  password: str = ""
  token: str = ""
  # Logical identifier for this database instance.
  id: str = "default"

  @property
  def uri(self) -> str:
    # HTTP endpoint clients use to reach this instance.
    return f"http://{self.host}:{self.port}"
class TestHelpers:
  """General-purpose helpers shared by the RAG integration tests."""
  @staticmethod
  def find_free_port():
    """Find a free TCP port on the local machine.

    Returns:
      int: A port number the OS reported as free at call time. This is
        inherently racy: another process may claim the port between this
        call and its later use.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
      # BUGFIX: SO_REUSEADDR must be set *before* bind() to affect this
      # socket's binding; the original set it afterwards, where it is a
      # no-op for the bind that already happened.
      s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
      # Bind to port 0, which asks the OS to assign a free port.
      s.bind(('', 0))
      # Return the port number assigned by the OS.
      return s.getsockname()[1]
class CustomMilvusContainer(MilvusContainer):
  """Milvus container configured for standalone mode with explicit ports.

  Bypasses MilvusContainer's own initialisation so that both the service
  and health-check container ports can be chosen by the caller, each
  bound to a freshly allocated free port on the host.
  """
  def __init__(  # pylint: disable=bad-super-call
      self,
      image: str,
      service_container_port,
      healthcheck_container_port,
      **kwargs,
  ) -> None:
    # Deliberately skip MilvusContainer.__init__ and initialise the
    # grandparent (GenericContainer) directly so the parent's default
    # port wiring never runs.
    super(MilvusContainer, self).__init__(image=image, **kwargs)
    self.port = service_container_port
    self.healthcheck_port = healthcheck_container_port
    self.with_exposed_ports(service_container_port, healthcheck_container_port)
    # Pin each container port to a free port on the host.
    self.with_bind_ports(service_container_port, TestHelpers.find_free_port())
    self.with_bind_ports(
        healthcheck_container_port, TestHelpers.find_free_port())
    self.cmd = "milvus run standalone"
    # Environment required for a standalone Milvus deployment: embedded
    # etcd, local object storage, and the metrics/health-check port.
    standalone_env = {
        "ETCD_USE_EMBED": "true",
        "ETCD_DATA_DIR": "/var/lib/milvus/etcd",
        "COMMON_STORAGETYPE": "local",
        "METRICS_PORT": str(healthcheck_container_port),
    }
    for name, value in standalone_env.items():
      self.with_env(name, value)
class MilvusTestHelpers:
  """Helper utilities for testing Milvus vector database operations.

  Provides static methods for managing test containers, configuration files,
  and chunk comparison utilities for Milvus-based integration tests.
  """

  # IMPORTANT: When upgrading the Milvus server version, ensure the pymilvus
  # Python SDK client in setup.py is updated to match. Referring to the Milvus
  # release notes compatibility matrix at
  # https://milvus.io/docs/release_notes.md or PyPI at
  # https://pypi.org/project/pymilvus/ for version compatibility.
  # Example: Milvus v2.6.0 requires pymilvus==2.6.0 (exact match required).

  @staticmethod
  def _wait_for_milvus_grpc(uri: str) -> None:
    """Wait until Milvus accepts RPCs.

    Docker may report started before gRPC is ready.
    """
    def list_collections_probe():
      # A lightweight RPC that succeeds only once the server serves gRPC.
      client = MilvusClient(uri=uri)
      try:
        client.list_collections()
      finally:
        client.close()

    retry_with_backoff(
        list_collections_probe,
        max_retries=25,
        retry_delay=2.0,
        retry_backoff_factor=1.2,
        operation_name="Milvus client connection after container start",
        exception_types=(MilvusException, ))

  @staticmethod
  def start_db_container(
      image="milvusdb/milvus:v2.5.10",
      max_vec_fields=5,
      vector_client_max_retries=3,
      tc_max_retries=None) -> Optional[VectorDBContainerInfo]:
    """Start a standalone Milvus container, retrying on startup failure.

    Args:
      image: Docker image to run.
      max_vec_fields: Value for the proxy.maxVectorFieldNum override
        written into the container's user.yaml.
      vector_client_max_retries: Number of container start attempts before
        giving up.
      tc_max_retries: Optional temporary override for testcontainers'
        global connection max_tries; restored after all attempts finish.

    Returns:
      Connection info for the running container, or None only if the loop
      exits without a successful start (the failure path raises instead).

    Raises:
      Exception: Re-raises the last startup error when every attempt fails.
    """
    service_container_port = TestHelpers.find_free_port()
    healthcheck_container_port = TestHelpers.find_free_port()
    user_yaml_creator = MilvusTestHelpers.create_user_yaml
    with user_yaml_creator(service_container_port, max_vec_fields) as cfg:
      info = None
      original_tc_max_tries = testcontainers_config.max_tries
      if tc_max_retries is not None:
        testcontainers_config.max_tries = tc_max_retries
      try:
        for i in range(vector_client_max_retries):
          vector_db_container: Optional[CustomMilvusContainer] = None
          try:
            vector_db_container = CustomMilvusContainer(
                image=image,
                service_container_port=service_container_port,
                healthcheck_container_port=healthcheck_container_port)
            mapped_container = vector_db_container.with_volume_mapping(
                cfg, "/milvus/configs/user.yaml")
            assert mapped_container is not None
            running_container: CustomMilvusContainer = mapped_container
            vector_db_container = running_container
            running_container.start()
            host = running_container.get_container_host_ip()
            port = running_container.get_exposed_port(service_container_port)
            info = VectorDBContainerInfo(running_container, host, port)
            # Docker "started" does not imply gRPC readiness; block until
            # the server actually answers RPCs.
            MilvusTestHelpers._wait_for_milvus_grpc(info.uri)
            _LOGGER.info(
                "milvus db container started successfully on %s.", info.uri)
            break
          except Exception as e:
            stdout_logs = stderr_logs = ""
            if vector_db_container is not None:
              raw_out, raw_err = vector_db_container.get_logs()
              stdout_logs = raw_out.decode("utf-8")
              stderr_logs = raw_err.decode("utf-8")
            _LOGGER.warning(
                "Retry %d/%d: Failed to start Milvus DB container. Reason: %s. "
                "STDOUT logs:\n%s\nSTDERR logs:\n%s",
                i + 1,
                vector_client_max_retries,
                e,
                stdout_logs,
                stderr_logs)
            if i == vector_client_max_retries - 1:
              _LOGGER.error(
                  "Unable to start milvus db container for I/O tests after %d "
                  "retries. Tests cannot proceed. STDOUT logs:\n%s\n"
                  "STDERR logs:\n%s",
                  vector_client_max_retries,
                  stdout_logs,
                  stderr_logs)
              raise e
      finally:
        # BUGFIX: restore the global testcontainers setting once, after all
        # attempts complete. Previously this finally was attached to the
        # per-attempt try, so the tc_max_retries override was reverted after
        # the first attempt and never applied to any retry.
        testcontainers_config.max_tries = original_tc_max_tries
      return info

  @staticmethod
  def stop_db_container(db_info: VectorDBContainerInfo):
    """Stop a previously started Milvus container; no-op if db_info is None."""
    if db_info is None:
      _LOGGER.warning("Milvus db info is None. Skipping stop operation.")
      return
    _LOGGER.debug("Stopping milvus db container.")
    db_info.container.stop()
    _LOGGER.info("milvus db container stopped successfully.")

  @staticmethod
  def initialize_db_with_data(
      connc_params: MilvusConnectionParameters, config: dict):
    """Create a collection, index the configured corpus, then disconnect.

    Args:
      connc_params: Connection parameters for the Milvus client.
      config: Dict with keys "fields" (list of FieldSchema), "functions",
        "collection_name", "index" (zero-arg callable returning
        IndexParams) and "corpus" (list of document dicts).

    Returns:
      str: The name of the created collection.
    """
    # Open the connection to the milvus db with retry.
    def create_client():
      return MilvusClient(**connc_params.__dict__)

    client = retry_with_backoff(
        create_client,
        max_retries=5,
        retry_delay=2.0,
        operation_name="Test Milvus client connection",
        exception_types=(MilvusException, ))
    # Configure schema.
    field_schemas: List[FieldSchema] = cast(List[FieldSchema], config["fields"])
    schema = CollectionSchema(
        fields=field_schemas, functions=config["functions"])
    # Create collection with the schema.
    collection_name = config["collection_name"]
    index_function: Callable[[], IndexParams] = cast(
        Callable[[], IndexParams], config["index"])
    client.create_collection(
        collection_name=collection_name,
        schema=schema,
        index_params=index_function())
    # Assert that collection was created.
    collection_error = f"Expected collection '{collection_name}' to be created."
    assert client.has_collection(collection_name), collection_error
    # Gather all fields we have excluding 'sparse_embedding_bm25' special field.
    fields = [field.name for field in field_schemas]
    # Prep data for indexing. Currently we can't insert sparse vectors for BM25
    # sparse embedding field as it would be automatically generated by Milvus
    # through the registered BM25 function.
    data_ready_to_index = []
    for doc in config["corpus"]:
      item = {}
      for field in fields:
        if field.startswith("dense_embedding"):
          item[field] = doc["dense_embedding"]
        elif field == "sparse_embedding_inner_product":
          item[field] = doc["sparse_embedding"]
        elif field == "sparse_embedding_bm25":
          # It is automatically generated by Milvus from the content field.
          continue
        else:
          item[field] = doc[field]
      data_ready_to_index.append(item)
    # Index data.
    result = client.insert(
        collection_name=collection_name, data=data_ready_to_index)
    # Assert that the intended data has been properly indexed.
    insertion_err = f'failed to insert the {result["insert_count"]} data points'
    assert result["insert_count"] == len(data_ready_to_index), insertion_err
    # Release the collection from memory. It will be loaded lazily when the
    # enrichment handler is invoked.
    client.release_collection(collection_name)
    # Close the connection to the Milvus database, as no further preparation
    # operations are needed before executing the enrichment handler.
    client.close()
    return collection_name

  @staticmethod
  @contextlib.contextmanager
  def create_user_yaml(service_port: int, max_vector_field_num=5):
    """Creates a temporary user.yaml file for Milvus configuration.

    This user yaml file overrides Milvus default configurations. It sets
    the Milvus service port to the specified container service port. The
    default for maxVectorFieldNum is 4, but we need 5
    (one unique field for each metric).

    Args:
      service_port: Port number for the Milvus service.
      max_vector_field_num: Max number of vec fields allowed per collection.

    Yields:
      str: Path to the created temporary yaml file.
    """
    # delete=False so the file survives the NamedTemporaryFile context and
    # can be mounted into the container; removed explicitly below.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml',
                                     delete=False) as temp_file:
      # Define the content for user.yaml.
      user_config = {
          'proxy': {
              'maxVectorFieldNum': max_vector_field_num, 'port': service_port
          },
          'etcd': {
              'use': {
                  'embed': True
              }, 'data': {
                  'dir': '/var/lib/milvus/etcd'
              }
          }
      }
      # Write the content to the file.
      yaml.dump(user_config, temp_file, default_flow_style=False)
      path = temp_file.name
    try:
      yield path
    finally:
      if os.path.exists(path):
        os.remove(path)

  @staticmethod
  def assert_chunks_equivalent(
      actual_chunks: List[Chunk], expected_chunks: List[Chunk]):
    """assert_chunks_equivalent checks for presence rather than exact match"""
    # Sort both lists by ID to ensure consistent ordering.
    actual_sorted = sorted(actual_chunks, key=lambda c: c.id)
    expected_sorted = sorted(expected_chunks, key=lambda c: c.id)
    actual_len = len(actual_sorted)
    expected_len = len(expected_sorted)
    err_msg = (
        f"Different number of chunks, actual: {actual_len}, "
        f"expected: {expected_len}")
    assert actual_len == expected_len, err_msg
    for actual, expected in zip(actual_sorted, expected_sorted):
      # Assert that IDs match.
      assert actual.id == expected.id
      # Assert that dense embeddings match.
      err_msg = f"Dense embedding mismatch for chunk {actual.id}"
      assert actual.dense_embedding == expected.dense_embedding, err_msg
      # Assert that sparse embeddings match.
      err_msg = f"Sparse embedding mismatch for chunk {actual.id}"
      assert actual.sparse_embedding == expected.sparse_embedding, err_msg
      # Assert that text content match.
      err_msg = f"Text Content mismatch for chunk {actual.id}"
      assert actual.content.text == expected.content.text, err_msg
      # For enrichment_data, be more flexible.
      # If "expected" has values for enrichment_data but actual doesn't, that's
      # acceptable since vector search results can vary based on many factors
      # including implementation details, vector database state, and slight
      # variations in similarity calculations.
      # First ensure the enrichment data key exists.
      err_msg = f"Missing enrichment_data key in chunk {actual.id}"
      assert 'enrichment_data' in actual.metadata, err_msg
      # For enrichment_data, ensure consistent ordering of results.
      actual_data = actual.metadata['enrichment_data']
      expected_data = expected.metadata['enrichment_data']
      # If actual has enrichment data, then perform detailed validation.
      if actual_data:
        # Ensure the id key exist.
        err_msg = f"Missing id key in metadata {actual.id}"
        assert 'id' in actual_data, err_msg
        # Validate IDs have consistent ordering.
        actual_ids = sorted(actual_data['id'])
        expected_ids = sorted(expected_data['id'])
        err_msg = f"IDs in enrichment_data don't match for chunk {actual.id}"
        assert actual_ids == expected_ids, err_msg
        # Ensure the distance key exist.
        err_msg = f"Missing distance key in metadata {actual.id}"
        assert 'distance' in actual_data, err_msg
        # Validate distances exist and have same length as IDs.
        actual_distances = actual_data['distance']
        expected_distances = expected_data['distance']
        err_msg = (
            "Number of distances doesn't match number of IDs for "
            f"chunk {actual.id}")
        assert len(actual_distances) == len(expected_distances), err_msg
        # Ensure the fields key exist.
        err_msg = f"Missing fields key in metadata {actual.id}"
        assert 'fields' in actual_data, err_msg
        # Validate fields have consistent content.
        # Sort fields by 'id' to ensure consistent ordering.
        actual_fields_sorted = sorted(
            actual_data['fields'], key=lambda f: f.get('id', 0))
        expected_fields_sorted = sorted(
            expected_data['fields'], key=lambda f: f.get('id', 0))
        # Compare field IDs.
        actual_field_ids = [f.get('id') for f in actual_fields_sorted]
        expected_field_ids = [f.get('id') for f in expected_fields_sorted]
        err_msg = f"Field IDs don't match for chunk {actual.id}"
        assert actual_field_ids == expected_field_ids, err_msg
        # Compare field content.
        for a_f, e_f in zip(actual_fields_sorted, expected_fields_sorted):
          # Ensure the id key exist.
          err_msg = f"Missing id key in metadata.fields {actual.id}"
          assert 'id' in a_f, err_msg
          err_msg = f"Field ID mismatch chunk {actual.id}"
          assert a_f['id'] == e_f['id'], err_msg
          # Validate field metadata.
          err_msg = f"Field Metadata doesn't match for chunk {actual.id}"
          assert a_f['metadata'] == e_f['metadata'], err_msg
# Allow running this module's self-checks directly from the command line.
if __name__ == '__main__':
  unittest.main()