#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Tests for transforms defined in apache_beam.io.fileio."""

# pytype: skip-file

import csv
import io
import json
import logging
import os
import unittest
import uuid
import warnings

import pytest
from hamcrest.library.text import stringmatches

import apache_beam as beam
from apache_beam.io import fileio
from apache_beam.io.filebasedsink_test import _TestCaseWithTempDirCleanUp
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.test_stream import TestStream
from apache_beam.testing.test_utils import compute_hash
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to
from apache_beam.testing.util import matches_all
from apache_beam.transforms import trigger
from apache_beam.transforms.window import FixedWindows
from apache_beam.transforms.window import GlobalWindow
from apache_beam.transforms.window import IntervalWindow
from apache_beam.utils.timestamp import Timestamp

warnings.filterwarnings(
    'ignore', category=FutureWarning, module='apache_beam.io.fileio_test')


def _get_file_reader(readable_file):
  return io.TextIOWrapper(readable_file.open())


class MatchTest(_TestCaseWithTempDirCleanUp):
  def test_basic_two_files(self):
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a couple files to be matched
    files.append(self._create_temp_file(dir=tempdir))
    files.append(self._create_temp_file(dir=tempdir))

    with TestPipeline() as p:
      files_pc = (
          p
          | fileio.MatchFiles(FileSystems.join(tempdir, '*'))
          | beam.Map(lambda x: x.path))

      assert_that(files_pc, equal_to(files))

  def test_match_all_two_directories(self):
    files = []
    directories = []

    for _ in range(2):
      # TODO: What about this having to append the ending slash?
      d = '%s%s' % (self._new_tempdir(), os.sep)
      directories.append(d)

      files.append(self._create_temp_file(dir=d))
      files.append(self._create_temp_file(dir=d))

    with TestPipeline() as p:
      files_pc = (
          p
          | beam.Create([FileSystems.join(d, '*') for d in directories])
          | fileio.MatchAll()
          | beam.Map(lambda x: x.path))

      assert_that(files_pc, equal_to(files))

  def test_match_files_one_directory_failure1(self):
    directories = [
        '%s%s' % (self._new_tempdir(), os.sep),
        '%s%s' % (self._new_tempdir(), os.sep)
    ]

    files = []
    files.append(self._create_temp_file(dir=directories[0]))
    files.append(self._create_temp_file(dir=directories[0]))

    with self.assertRaisesRegex(Exception, "Empty match for pattern"):
      with TestPipeline() as p:
        files_pc = (
            p
            | beam.Create([FileSystems.join(d, '*') for d in directories])
            | fileio.MatchAll(fileio.EmptyMatchTreatment.DISALLOW)
            | beam.Map(lambda x: x.path))

        assert_that(files_pc, equal_to(files))

  def test_match_files_one_directory_failure2(self):
    directories = [
        '%s%s' % (self._new_tempdir(), os.sep),
        '%s%s' % (self._new_tempdir(), os.sep)
    ]

    files = []
    files.append(self._create_temp_file(dir=directories[0]))
    files.append(self._create_temp_file(dir=directories[0]))

    with TestPipeline() as p:
      files_pc = (
          p
          | beam.Create([FileSystems.join(d, '*') for d in directories])
          | fileio.MatchAll(fileio.EmptyMatchTreatment.ALLOW_IF_WILDCARD)
          | beam.Map(lambda x: x.path))

      assert_that(files_pc, equal_to(files))


class ReadTest(_TestCaseWithTempDirCleanUp):
  def test_basic_file_name_provided(self):
    content = 'TestingMyContent\nIn multiple lines\nhaha!'
    dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=dir, content=content)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read().decode('utf-8').splitlines()))

      assert_that(content_pc, equal_to(content.splitlines()))

  def test_csv_file_source(self):
    content = 'name,year,place\ngoogle,1999,CA\nspotify,2006,sweden'
    rows = [r.split(',') for r in content.split('\n')]

    dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=dir, content=content)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

      assert_that(content_pc, equal_to(rows))

  def test_infer_compressed_file(self):
    dir = '%s%s' % (self._new_tempdir(), os.sep)

    file_contents = b'compressed_contents!'
    import gzip
    with gzip.GzipFile(os.path.join(dir, 'compressed.gz'), 'w') as f:
      f.write(file_contents)

    file_contents2 = b'compressed_contents_bz2!'
    import bz2
    with bz2.BZ2File(os.path.join(dir, 'compressed2.bz2'), 'w') as f:
      f.write(file_contents2)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.Map(lambda rf: rf.open().readline()))

      assert_that(content_pc, equal_to([file_contents, file_contents2]))

  def test_read_bz2_compressed_file_without_suffix(self):
    dir = '%s%s' % (self._new_tempdir(), os.sep)

    file_contents = b'compressed_contents!'
    import bz2
    with bz2.BZ2File(os.path.join(dir, 'compressed'), 'w') as f:
      f.write(file_contents)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.Map(
              lambda rf: rf.open(compression_type=CompressionTypes.BZIP2).read(
                  len(file_contents))))

      assert_that(content_pc, equal_to([file_contents]))

  def test_read_gzip_compressed_file_without_suffix(self):
    dir = '%s%s' % (self._new_tempdir(), os.sep)

    file_contents = b'compressed_contents!'
    import gzip
    with gzip.GzipFile(os.path.join(dir, 'compressed'), 'w') as f:
      f.write(file_contents)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.Map(
              lambda rf: rf.open(compression_type=CompressionTypes.GZIP).read(
                  len(file_contents))))

      assert_that(content_pc, equal_to([file_contents]))

  def test_string_filenames_and_skip_directory(self):
    content = 'thecontent\n'
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a couple files to be matched
    files.append(self._create_temp_file(dir=tempdir, content=content))
    files.append(self._create_temp_file(dir=tempdir, content=content))

    with TestPipeline() as p:
      contents_pc = (
          p
          | beam.Create(files + ['%s/' % tempdir])
          | fileio.ReadMatches()
          | beam.FlatMap(lambda x: x.read().decode('utf-8').splitlines()))

      assert_that(contents_pc, equal_to(content.splitlines() * 2))

  def test_fail_on_directories(self):
    content = 'thecontent\n'
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a couple files to be matched
    files.append(self._create_temp_file(dir=tempdir, content=content))
    files.append(self._create_temp_file(dir=tempdir, content=content))

    with self.assertRaisesRegex(Exception, "Directories are not allowed"):
      with TestPipeline() as p:
        _ = (
            p
            | beam.Create(files + ['%s/' % tempdir])
            | fileio.ReadMatches(skip_directories=False)
            | beam.Map(lambda x: x.read_utf8()))


class MatchIntegrationTest(unittest.TestCase):
  INPUT_FILE = 'gs://dataflow-samples/shakespeare/kinglear.txt'
  KINGLEAR_CHECKSUM = 'f418b25f1507f5a901257026b035ac2857a7ab87'
  INPUT_FILE_LARGE = (
      'gs://dataflow-samples/wikipedia_edits/wiki_data-00000000000*.json')

  WIKI_FILES = [
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000000.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000001.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000002.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000003.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000004.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000005.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000006.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000007.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000008.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000009.json',
  ]

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)

  @pytest.mark.it_postcommit
  def test_transform_on_gcs(self):
    args = self.test_pipeline.get_full_options_as_args()

    with beam.Pipeline(argv=args) as p:
      matches_pc = (
          p
          | beam.Create([self.INPUT_FILE, self.INPUT_FILE_LARGE])
          | fileio.MatchAll()
          | 'GetPath' >> beam.Map(lambda metadata: metadata.path))

      assert_that(
          matches_pc,
          equal_to([self.INPUT_FILE] + self.WIKI_FILES),
          label='Matched Files')

      checksum_pc = (
          p
          | 'SingleFile' >> beam.Create([self.INPUT_FILE])
          | 'MatchOneAll' >> fileio.MatchAll()
          | fileio.ReadMatches()
          | 'ReadIn' >>
          beam.Map(lambda x: x.read_utf8().split('\n'))
          | 'Checksums' >> beam.Map(compute_hash))

      assert_that(
          checksum_pc,
          equal_to([self.KINGLEAR_CHECKSUM]),
          label='Assert Checksums')


class MatchContinuouslyTest(_TestCaseWithTempDirCleanUp):
  def test_with_deduplication(self):
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a file to be matched before pipeline
    files.append(self._create_temp_file(dir=tempdir))
    # Add file name that will be created mid-pipeline
    files.append(FileSystems.join(tempdir, 'extra'))

    interval = 0.2
    start = Timestamp.now()
    stop = start + interval + 0.1

    def _create_extra_file(element):
      writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
      writer.close()
      return element.path

    with TestPipeline() as p:
      match_continuously = (
          p
          | fileio.MatchContinuously(
              file_pattern=FileSystems.join(tempdir, '*'),
              interval=interval,
              start_timestamp=start,
              stop_timestamp=stop)
          | beam.Map(_create_extra_file))

      assert_that(match_continuously, equal_to(files))

  def test_without_deduplication(self):
    interval = 0.2
    start = Timestamp.now()
    stop = start + interval + 0.1

    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    # Create a file to be matched before pipeline starts
    file = self._create_temp_file(dir=tempdir)
    # Add file twice, since it will be matched for every interval
    files += [file, file]
    # Add file name that will be created mid-pipeline
    files.append(FileSystems.join(tempdir, 'extra'))

    def _create_extra_file(element):
      writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
      writer.close()
      return element.path

    with TestPipeline() as p:
      match_continuously = (
          p
          | fileio.MatchContinuously(
              file_pattern=FileSystems.join(tempdir, '*'),
              interval=interval,
              has_deduplication=False,
              start_timestamp=start,
              stop_timestamp=stop)
          | beam.Map(_create_extra_file))

      assert_that(match_continuously, equal_to(files))

  def test_match_updated_files(self):
    files = []
    tempdir = '%s%s' % (self._new_tempdir(), os.sep)

    def _create_extra_file(element):
      writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
      writer.close()
      return element.path

    # Create two files to be matched before pipeline
    files.append(self._create_temp_file(dir=tempdir))
    writer = FileSystems.create(FileSystems.join(tempdir, 'extra'))
    writer.close()
    # Add file name that will be created mid-pipeline
    files.append(FileSystems.join(tempdir, 'extra'))
    files.append(FileSystems.join(tempdir, 'extra'))

    interval = 0.2
    start = Timestamp.now()
    stop = start + interval + 0.1

    with TestPipeline() as p:
      match_continuously = (
          p
          | fileio.MatchContinuously(
              file_pattern=FileSystems.join(tempdir, '*'),
              interval=interval,
              start_timestamp=start,
              stop_timestamp=stop,
              match_updated_files=True)
          | beam.Map(_create_extra_file))

      assert_that(match_continuously, equal_to(files))


class WriteFilesTest(_TestCaseWithTempDirCleanUp):
  SIMPLE_COLLECTION = [
      {'project': 'beam', 'foundation': 'apache'},
      {'project': 'prometheus', 'foundation': 'cncf'},
      {'project': 'flink', 'foundation': 'apache'},
      {'project': 'grpc', 'foundation': 'cncf'},
      {'project': 'spark', 'foundation': 'apache'},
      {'project': 'kubernetes', 'foundation': 'cncf'},
      {'project': 'spark', 'foundation': 'apache'},
      {'project': 'knative', 'foundation': 'cncf'},
      {'project': 'linux', 'foundation': 'linux'},
  ]

  LARGER_COLLECTION = ['{:05d}'.format(i) for i in range(200)]

  CSV_HEADERS = ['project', 'foundation']

  SIMPLE_COLLECTION_VALIDATION_SET = {(elm['project'], elm['foundation'])
                                      for elm in SIMPLE_COLLECTION}

  class CsvSink(fileio.TextSink):
    def __init__(self, headers):
      self.headers = headers

    def write(self, record):
      self._fh.write(','.join([record[h] for h in self.headers]).encode('utf8'))
      self._fh.write('\n'.encode('utf8'))

  class JsonSink(fileio.TextSink):
    def write(self, record):
      self._fh.write(json.dumps(record).encode('utf8'))
      self._fh.write('\n'.encode('utf8'))

  def test_write_to_single_file_batch(self):
    dir = self._new_tempdir()

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | "Serialize" >> beam.Map(json.dumps)
          | beam.io.fileio.WriteToFiles(path=dir))

    with TestPipeline() as p:
      result = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, '*'))
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
          | beam.Map(json.loads))

      assert_that(result, equal_to([row for row in self.SIMPLE_COLLECTION]))

  def test_write_to_dynamic_destination(self):
    sink_params = [
        fileio.TextSink,  # pass a type signature
        fileio.TextSink()  # pass a FileSink object
    ]

    # Test assumes that all records will be handled by the same worker
    # process, so pin to FnApiRunner to guarantee this.
    runner = 'FnApiRunner'

    for sink in sink_params:
      dir = self._new_tempdir()

      with TestPipeline(runner) as p:
        _ = (
            p
            | "Create" >> beam.Create(range(100))
            | beam.Map(lambda x: str(x))
            | fileio.WriteToFiles(
                path=dir,
                destination=lambda n: "odd" if int(n) % 2 else "even",
                sink=sink,
                file_naming=fileio.destination_prefix_naming("test")))

      with TestPipeline(runner) as p:
        result = (
            p
            | fileio.MatchFiles(FileSystems.join(dir, '*'))
            | fileio.ReadMatches()
            | beam.Map(
                lambda f: (
                    os.path.basename(f.metadata.path).split('-')[0],
                    sorted(map(int, f.read_utf8().strip().split('\n'))))))

        assert_that(
            result,
            equal_to([('odd', list(range(1, 100, 2))),
                      ('even', list(range(0, 100, 2)))]))

  def test_write_to_different_file_types_some_spilling(self):
    dir = self._new_tempdir()

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | beam.io.fileio.WriteToFiles(
              path=dir,
              destination=lambda record: record['foundation'],
              sink=lambda dest: (
                  WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                  if dest == 'apache' else WriteFilesTest.JsonSink()),
              file_naming=fileio.destination_prefix_naming(),
              max_writers_per_bundle=1))

    with TestPipeline() as p:
      cncf_res = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
          | beam.Map(json.loads))

      apache_res = (
          p
          | "MatchApache" >> fileio.MatchFiles(FileSystems.join(dir, 'apache*'))
          | "ReadApache" >> fileio.ReadMatches()
          | "MapApache" >>
          beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

      assert_that(
          cncf_res,
          equal_to([
              row for row in self.SIMPLE_COLLECTION
              if row['foundation'] == 'cncf'
          ]),
          label='verifyCNCF')

      assert_that(
          apache_res,
          equal_to([[row['project'], row['foundation']]
                    for row in self.SIMPLE_COLLECTION
                    if row['foundation'] == 'apache']),
          label='verifyApache')

  @unittest.skip('https://github.com/apache/beam/issues/21269')
  def test_find_orphaned_files(self):
    dir = self._new_tempdir()

    write_transform = beam.io.fileio.WriteToFiles(path=dir)

    def write_orphaned_file(temp_dir, writer_key):
      temp_dir_path = FileSystems.join(dir, temp_dir)

      file_prefix_dir = FileSystems.join(
          temp_dir_path, str(abs(hash(writer_key))))

      file_name = '%s_%s' % (file_prefix_dir, uuid.uuid4())
      with FileSystems.create(file_name) as f:
        f.write(b'Hello y\'all')

      return file_name

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | "Serialize" >> beam.Map(json.dumps)
          | write_transform)

      # Pre-create the temp directory.
      temp_dir_path = FileSystems.mkdirs(
          FileSystems.join(dir, write_transform._temp_directory.get()))
      write_orphaned_file(
          write_transform._temp_directory.get(), (None, GlobalWindow()))
      f2 = write_orphaned_file(
          write_transform._temp_directory.get(), ('other-dest', GlobalWindow()))

    temp_dir_path = FileSystems.join(dir, write_transform._temp_directory.get())
    leftovers = FileSystems.match(['%s%s*' % (temp_dir_path, os.sep)])
    found_files = [m.path for m in leftovers[0].metadata_list]

    self.assertListEqual(found_files, [f2])

  def test_write_to_different_file_types(self):
    dir = self._new_tempdir()

    with TestPipeline() as p:
      _ = (
          p
          | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
          | beam.io.fileio.WriteToFiles(
              path=dir,
              destination=lambda record: record['foundation'],
              sink=lambda dest: (
                  WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                  if dest == 'apache' else WriteFilesTest.JsonSink()),
              file_naming=fileio.destination_prefix_naming()))

    with TestPipeline() as p:
      cncf_res = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
          | beam.Map(json.loads))

      apache_res = (
          p
          | "MatchApache" >> fileio.MatchFiles(FileSystems.join(dir, 'apache*'))
          | "ReadApache" >> fileio.ReadMatches()
          | "MapApache" >>
          beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

      assert_that(
          cncf_res,
          equal_to([
              row for row in self.SIMPLE_COLLECTION
              if row['foundation'] == 'cncf'
          ]),
          label='verifyCNCF')

      assert_that(
          apache_res,
          equal_to([[row['project'], row['foundation']]
                    for row in self.SIMPLE_COLLECTION
                    if row['foundation'] == 'apache']),
          label='verifyApache')

  def record_dofn(self):
    class RecordDoFn(beam.DoFn):
      def process(self, element):
        WriteFilesTest.all_records.append(element)

    return RecordDoFn()

  def test_streaming_complex_timing(self):
    # Use state on the TestCase class, since other references would be pickled
    # into a closure and not have the desired side effects.
    #
    # TODO(https://github.com/apache/beam/issues/18987): Use assert_that after
    # it works for the cases here in streaming mode.
    WriteFilesTest.all_records = []

    dir = '%s%s' % (self._new_tempdir(), os.sep)

    # Setting up the input (TestStream)
    ts = TestStream().advance_watermark_to(0)
    for elm in WriteFilesTest.LARGER_COLLECTION:
      timestamp = int(elm)

      ts.add_elements([('key', '%s' % elm)])
      if timestamp % 5 == 0 and timestamp != 0:
        # TODO(https://github.com/apache/beam/issues/18721): Add many firings
        # per window after getting PaneInfo.
        ts.advance_processing_time(5)
        ts.advance_watermark_to(timestamp)
    ts.advance_watermark_to_infinity()

    def no_colon_file_naming(*args):
      file_name = fileio.destination_prefix_naming()(*args)
      return file_name.replace(':', '_')

    # The pipeline that we are testing
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    with TestPipeline(options=options) as p:
      res = (
          p
          | ts
          | beam.WindowInto(
              FixedWindows(10),
              trigger=trigger.AfterWatermark(),
              accumulation_mode=trigger.AccumulationMode.DISCARDING)
          | beam.GroupByKey()
          | beam.FlatMap(lambda x: x[1]))
      # Triggering after 5 processing-time seconds, and on the watermark. Also
      # discarding old elements.
      _ = (
          res
          | beam.io.fileio.WriteToFiles(
              path=dir,
              file_naming=no_colon_file_naming,
              max_writers_per_bundle=0)
          | beam.Map(lambda fr: FileSystems.join(dir, fr.file_name))
          | beam.ParDo(self.record_dofn()))

    # Verification pipeline
    with TestPipeline() as p:
      files = (p | beam.io.fileio.MatchFiles(FileSystems.join(dir, '*')))
      file_names = (files | beam.Map(lambda fm: fm.path))
      file_contents = (
          files
          | beam.io.fileio.ReadMatches()
          | beam.Map(
              lambda rf: (rf.metadata.path, rf.read_utf8().strip().split('\n')))
      )

      content = (
          file_contents
          | beam.FlatMap(lambda fc: [ln.strip() for ln in fc[1]]))

      assert_that(
          file_names,
          equal_to(WriteFilesTest.all_records),
          label='AssertFilesMatch')
      assert_that(
          content,
          matches_all(WriteFilesTest.LARGER_COLLECTION),
          label='AssertContentsMatch')

  def test_streaming_different_file_types(self):
    dir = self._new_tempdir()
    input = iter(WriteFilesTest.SIMPLE_COLLECTION)
    ts = (
        TestStream().advance_watermark_to(0).add_elements(
            [next(input), next(input)]).advance_watermark_to(10).add_elements(
                [next(input), next(input)]).advance_watermark_to(20)
        .add_elements([next(input), next(input)]).advance_watermark_to(30)
        .add_elements([next(input), next(input)]).advance_watermark_to(40)
        .advance_watermark_to_infinity())

    def no_colon_file_naming(*args):
      file_name = fileio.destination_prefix_naming()(*args)
      return file_name.replace(':', '_')

    with TestPipeline() as p:
      _ = (
          p
          | ts
          | beam.WindowInto(FixedWindows(10))
          | beam.io.fileio.WriteToFiles(
              path=dir,
              destination=lambda record: record['foundation'],
              sink=lambda dest: (
                  WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                  if dest == 'apache' else WriteFilesTest.JsonSink()),
              file_naming=no_colon_file_naming,
              max_writers_per_bundle=0,
          ))

    with TestPipeline() as p:
      cncf_files = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
          | "CncfFileNames" >> beam.Map(lambda fm: fm.path))

      apache_files = (
          p
          | "MatchApache" >> fileio.MatchFiles(FileSystems.join(dir, 'apache*'))
          | "ApacheFileNames" >> beam.Map(lambda fm: fm.path))

      assert_that(
          cncf_files,
          matches_all([
              stringmatches.matches_regexp(
                  '.*cncf-1970-01-01T00_00_00-1970-01-01T00_00_10.*'),
              stringmatches.matches_regexp(
                  '.*cncf-1970-01-01T00_00_10-1970-01-01T00_00_20.*'),
              stringmatches.matches_regexp(
                  '.*cncf-1970-01-01T00_00_20-1970-01-01T00_00_30.*'),
              stringmatches.matches_regexp(
                  '.*cncf-1970-01-01T00_00_30-1970-01-01T00_00_40.*')
          ]),
          label='verifyCNCFFiles')

      assert_that(
          apache_files,
          matches_all([
              stringmatches.matches_regexp(
                  '.*apache-1970-01-01T00_00_00-1970-01-01T00_00_10.*'),
              stringmatches.matches_regexp(
                  '.*apache-1970-01-01T00_00_10-1970-01-01T00_00_20.*'),
              stringmatches.matches_regexp(
                  '.*apache-1970-01-01T00_00_20-1970-01-01T00_00_30.*'),
              stringmatches.matches_regexp(
                  '.*apache-1970-01-01T00_00_30-1970-01-01T00_00_40.*')
          ]),
          label='verifyApacheFiles')

  def test_shard_naming(self):
    namer = fileio.default_file_naming(prefix='/path/to/file', suffix='.txt')
    self.assertEqual(
        namer(GlobalWindow(), None, None, None, None, None),
        '/path/to/file.txt')
    self.assertEqual(
        namer(GlobalWindow(), None, 1, 5, None, None),
        '/path/to/file-00001-of-00005.txt')
    self.assertEqual(
        namer(GlobalWindow(), None, 1, 5, 'gz', None),
        '/path/to/file-00001-of-00005.txt.gz')
    self.assertEqual(
        namer(IntervalWindow(0, 100), None, 1, 5, None, None),
        '/path/to/file'
        '-1970-01-01T00:00:00-1970-01-01T00:01:40-00001-of-00005.txt')


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()