Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 9b7bbc2

Browse files
Sourabh Bajaj authored and aaltay committed
[BEAM-1988] Migrate from utils.path to BFS
1 parent 1dce98f commit 9b7bbc2

8 files changed

Lines changed: 31 additions & 140 deletions

File tree

sdks/python/apache_beam/io/gcp/gcsfilesystem.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,7 @@ def join(self, basepath, *paths):
4646
raise ValueError('Basepath %r must be GCS path.', basepath)
4747
path = basepath
4848
for p in paths:
49-
if path == '' or path.endswith('/'):
50-
path += p
51-
else:
52-
path += '/' + p
49+
path = path.rstrip('/') + '/' + p.lstrip('/')
5350
return path
5451

5552
def mkdirs(self, path):

sdks/python/apache_beam/io/gcp/gcsfilesystem_test.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,16 @@ def test_join(self):
4242
file_system.join('gs://bucket/path', 'to', 'file'))
4343
self.assertEqual('gs://bucket/path/to/file',
4444
file_system.join('gs://bucket/path', 'to/file'))
45-
self.assertEqual('gs://bucket/path//to/file',
45+
self.assertEqual('gs://bucket/path/to/file',
4646
file_system.join('gs://bucket/path', '/to/file'))
47+
self.assertEqual('gs://bucket/path/to/file',
48+
file_system.join('gs://bucket/path/', 'to', 'file'))
49+
self.assertEqual('gs://bucket/path/to/file',
50+
file_system.join('gs://bucket/path/', 'to/file'))
51+
self.assertEqual('gs://bucket/path/to/file',
52+
file_system.join('gs://bucket/path/', '/to/file'))
53+
with self.assertRaises(ValueError):
54+
file_system.join('/bucket/path/', '/to/file')
4755

4856
@mock.patch('apache_beam.io.gcp.gcsfilesystem.gcsio')
4957
def test_match_multiples(self, mock_gcsio):

sdks/python/apache_beam/runners/dataflow/internal/apiclient.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@
3030
from apitools.base.py import encoding
3131
from apitools.base.py import exceptions
3232

33-
from apache_beam import utils
3433
from apache_beam.internal.gcp.auth import get_service_credentials
3534
from apache_beam.internal.gcp.json_value import to_json_value
35+
from apache_beam.io.filesystems_util import get_filesystem
3636
from apache_beam.io.gcp.internal.clients import storage
3737
from apache_beam.runners.dataflow.internal import dependency
3838
from apache_beam.runners.dataflow.internal.clients import dataflow
@@ -336,10 +336,12 @@ def __init__(self, options):
336336
# for GCS staging locations where the potential for such clashes is high.
337337
if self.google_cloud_options.staging_location.startswith('gs://'):
338338
path_suffix = '%s.%f' % (self.google_cloud_options.job_name, time.time())
339-
self.google_cloud_options.staging_location = utils.path.join(
339+
filesystem = get_filesystem(self.google_cloud_options.staging_location)
340+
self.google_cloud_options.staging_location = filesystem.join(
340341
self.google_cloud_options.staging_location, path_suffix)
341-
self.google_cloud_options.temp_location = utils.path.join(
342+
self.google_cloud_options.temp_location = filesystem.join(
342343
self.google_cloud_options.temp_location, path_suffix)
344+
343345
self.proto = dataflow.Job(name=self.google_cloud_options.job_name)
344346
if self.options.view_as(StandardOptions).streaming:
345347
self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_STREAMING

sdks/python/apache_beam/runners/dataflow/internal/dependency.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,9 @@
6161
import sys
6262
import tempfile
6363

64-
65-
from apache_beam import utils
6664
from apache_beam import version as beam_version
6765
from apache_beam.internal import pickler
66+
from apache_beam.io.filesystems_util import get_filesystem
6867
from apache_beam.runners.dataflow.internal import names
6968
from apache_beam.utils import processes
7069
from apache_beam.utils.pipeline_options import GoogleCloudOptions
@@ -158,6 +157,7 @@ def _stage_extra_packages(extra_packages, staging_location, temp_dir,
158157
name patterns.
159158
"""
160159
resources = []
160+
staging_filesystem = get_filesystem(staging_location)
161161
staging_temp_dir = None
162162
local_packages = []
163163
for package in extra_packages:
@@ -190,13 +190,14 @@ def _stage_extra_packages(extra_packages, staging_location, temp_dir,
190190
local_packages.append(package)
191191

192192
if staging_temp_dir:
193+
temp_fs = get_filesystem(staging_temp_dir)
193194
local_packages.extend(
194-
[utils.path.join(staging_temp_dir, f) for f in os.listdir(
195+
[temp_fs.join(staging_temp_dir, f) for f in os.listdir(
195196
staging_temp_dir)])
196197

197198
for package in local_packages:
198199
basename = os.path.basename(package)
199-
staged_path = utils.path.join(staging_location, basename)
200+
staged_path = staging_filesystem.join(staging_location, basename)
200201
file_copy(package, staged_path)
201202
resources.append(basename)
202203
# Create a file containing the list of extra packages and stage it.
@@ -209,7 +210,7 @@ def _stage_extra_packages(extra_packages, staging_location, temp_dir,
209210
with open(os.path.join(temp_dir, EXTRA_PACKAGES_FILE), 'wt') as f:
210211
for package in local_packages:
211212
f.write('%s\n' % os.path.basename(package))
212-
staged_path = utils.path.join(staging_location, EXTRA_PACKAGES_FILE)
213+
staged_path = staging_filesystem.join(staging_location, EXTRA_PACKAGES_FILE)
213214
# Note that the caller of this function is responsible for deleting the
214215
# temporary folder where all temp files are created, including this one.
215216
file_copy(os.path.join(temp_dir, EXTRA_PACKAGES_FILE), staged_path)
@@ -284,13 +285,15 @@ def stage_job_resources(
284285
raise RuntimeError(
285286
'The --temp_location option must be specified.')
286287

288+
filesystem = get_filesystem(google_cloud_options.staging_location)
289+
287290
# Stage a requirements file if present.
288291
if setup_options.requirements_file is not None:
289292
if not os.path.isfile(setup_options.requirements_file):
290293
raise RuntimeError('The file %s cannot be found. It was specified in the '
291294
'--requirements_file command line option.' %
292295
setup_options.requirements_file)
293-
staged_path = utils.path.join(google_cloud_options.staging_location,
296+
staged_path = filesystem.join(google_cloud_options.staging_location,
294297
REQUIREMENTS_FILE)
295298
file_copy(setup_options.requirements_file, staged_path)
296299
resources.append(REQUIREMENTS_FILE)
@@ -305,7 +308,7 @@ def stage_job_resources(
305308
populate_requirements_cache(
306309
setup_options.requirements_file, requirements_cache_path)
307310
for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
308-
file_copy(pkg, utils.path.join(google_cloud_options.staging_location,
311+
file_copy(pkg, filesystem.join(google_cloud_options.staging_location,
309312
os.path.basename(pkg)))
310313
resources.append(os.path.basename(pkg))
311314

@@ -324,7 +327,7 @@ def stage_job_resources(
324327
'setup.py instead of %s' % setup_options.setup_file)
325328
tarball_file = _build_setup_package(setup_options.setup_file, temp_dir,
326329
build_setup_args)
327-
staged_path = utils.path.join(google_cloud_options.staging_location,
330+
staged_path = filesystem.join(google_cloud_options.staging_location,
328331
WORKFLOW_TARBALL_FILE)
329332
file_copy(tarball_file, staged_path)
330333
resources.append(WORKFLOW_TARBALL_FILE)
@@ -344,7 +347,7 @@ def stage_job_resources(
344347
pickled_session_file = os.path.join(temp_dir,
345348
names.PICKLED_MAIN_SESSION_FILE)
346349
pickler.dump_session(pickled_session_file)
347-
staged_path = utils.path.join(google_cloud_options.staging_location,
350+
staged_path = filesystem.join(google_cloud_options.staging_location,
348351
names.PICKLED_MAIN_SESSION_FILE)
349352
file_copy(pickled_session_file, staged_path)
350353
resources.append(names.PICKLED_MAIN_SESSION_FILE)
@@ -359,7 +362,7 @@ def stage_job_resources(
359362
else:
360363
stage_tarball_from_remote_location = False
361364

362-
staged_path = utils.path.join(google_cloud_options.staging_location,
365+
staged_path = filesystem.join(google_cloud_options.staging_location,
363366
names.DATAFLOW_SDK_TARBALL_FILE)
364367
if stage_tarball_from_remote_location:
365368
# If --sdk_location is not specified then the appropriate package

sdks/python/apache_beam/runners/dataflow/internal/dependency_test.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import tempfile
2424
import unittest
2525

26-
from apache_beam import utils
26+
from apache_beam.io.filesystems_util import get_filesystem
2727
from apache_beam.runners.dataflow.internal import dependency
2828
from apache_beam.runners.dataflow.internal import names
2929
from apache_beam.utils.pipeline_options import GoogleCloudOptions
@@ -241,7 +241,8 @@ def override_file_copy(self, expected_from_path, expected_to_dir):
241241
def file_copy(from_path, to_path):
242242
if not from_path.endswith(names.PICKLED_MAIN_SESSION_FILE):
243243
self.assertEqual(expected_from_path, from_path)
244-
self.assertEqual(utils.path.join(expected_to_dir,
244+
filesystem = get_filesystem(expected_to_dir)
245+
self.assertEqual(filesystem.join(expected_to_dir,
245246
names.DATAFLOW_SDK_TARBALL_FILE),
246247
to_path)
247248
if from_path.startswith('gs://') or to_path.startswith('gs://'):

sdks/python/apache_beam/utils/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,3 @@
1616
#
1717

1818
"""A package containing utilities."""
19-
20-
# We must import path here to support the pattern of referencing utils.path
21-
# without needing to explicitly import apache_beam.utils.path.
22-
import path

sdks/python/apache_beam/utils/path.py

Lines changed: 0 additions & 46 deletions
This file was deleted.

sdks/python/apache_beam/utils/path_test.py

Lines changed: 0 additions & 70 deletions
This file was deleted.

0 commit comments

Comments (0)