Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 6c2ba81

Browse files
chamikaramjdhalperi
authored and committed
Adds a test harness and utilities framework for sources.
Helper functions and test harnesses for checking correctness of source (``iobase.BoundedSource``) and range tracker (``iobase.RangeTracker``) implementations. Contains a few lightweight utilities (e.g. ``readFromSource()`` for reading items from a source), as well as heavyweight property testing and stress testing harnesses that help achieve a large amount of test coverage with little code. The most notable ones are: * ``assertSourcesEqualReferenceSource()`` helps test that the data read by the union of sources produced by ``BoundedSource.split()`` is the same as the data read by the original source. * If your source implements dynamic work rebalancing, use the ``assertSplitAtFraction()`` family of functions - they test the behavior of ``RangeTracker.try_split()``, in particular, that various consistency properties are respected and the total set of data read by the source is preserved when splits happen. Use ``assertSplitAtFractionBehavior()`` to test individual cases of ``splitAtFraction()`` and use ``assertSplitAtFractionExhaustive()`` as a heavyweight stress test including concurrency.
1 parent 062af66 commit 6c2ba81

3 files changed

Lines changed: 710 additions & 10 deletions

File tree

sdks/python/apache_beam/io/avroio_test.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,10 @@
1919
import os
2020
import tempfile
2121
import unittest
22-
2322
from apache_beam.io import avroio
2423
from apache_beam.io import filebasedsource
24+
from apache_beam.io import source_test_utils
25+
import avro.datafile
2526
from avro.datafile import DataFileWriter
2627
from avro.io import DatumWriter
2728
import avro.schema as avro_schema
@@ -90,16 +91,15 @@ def _run_avro_test(
9091
if len(splits) < 2:
9192
raise ValueError('Test is trivial. Please adjust it so that at least '
9293
'two splits get generated')
93-
for split in splits:
94-
records = [record for record in split.source.read(
95-
split.source.get_range_tracker(split.start_position,
96-
split.stop_position))]
97-
read_records.extend(records)
98-
else:
99-
range_tracker = source.get_range_tracker(None, None)
100-
read_records = [record for record in source.read(range_tracker)]
10194

102-
self.assertItemsEqual(expected_result, read_records)
95+
sources_info = [
96+
(split.source, split.start_position, split.stop_position)
97+
for split in splits]
98+
source_test_utils.assertSourcesEqualReferenceSource(
99+
(source, None, None), sources_info)
100+
else:
101+
read_records = source_test_utils.readFromSource(source, None, None)
102+
self.assertItemsEqual(expected_result, read_records)
103103

104104
def test_read_without_splitting(self):
105105
file_name = self._write_data()
@@ -141,6 +141,21 @@ def test_read_with_splitting_pattern(self):
141141
expected_result = self.RECORDS * 3
142142
self._run_avro_test(pattern, 100, True, expected_result)
143143

144+
def test_dynamic_work_rebalancing_exhaustive(self):
145+
# Adjusting block size so that we can perform an exhaustive dynamic
146+
# work rebalancing test that completes within an acceptable amount of time.
147+
old_sync_interval = avro.datafile.SYNC_INTERVAL
148+
try:
149+
avro.datafile.SYNC_INTERVAL = 5
150+
file_name = self._write_data(count=20)
151+
source = avroio.AvroSource(file_name)
152+
splits = [split for split in source.split(
153+
desired_bundle_size=float('inf'))]
154+
assert len(splits) == 1
155+
source_test_utils.assertSplitAtFractionExhaustive(splits[0].source)
156+
finally:
157+
avro.datafile.SYNC_INTERVAL = old_sync_interval
158+
144159

145160
if __name__ == '__main__':
146161
logging.getLogger().setLevel(logging.INFO)

0 commit comments

Comments
 (0)