Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 80c0998

Browse files
authored
Use cached prism binary without re-unzipping by default. (apache#34616)
1 parent 7cb43f2 commit 80c0998

2 files changed

Lines changed: 202 additions & 21 deletions

File tree

sdks/python/apache_beam/runners/portability/prism_runner.py

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -94,16 +94,30 @@ def __init__(self, options):
9494
self._job_port = job_options.job_port
9595

9696
@classmethod
def maybe_unzip_and_make_executable(
    cls, url: str, bin_cache: str, ignore_cache: bool = True) -> str:
  """Resolves a local prism file (binary or zip) to an executable binary.

  If ``url`` is a zip archive, the member named after the archive's basename
  without its extension (e.g. ``prism`` for ``prism.zip``) is extracted into
  ``bin_cache``. The extraction is skipped only when ``ignore_cache`` is False
  and that target path already exists, in which case the existing file is
  reused. If ``url`` is not a zip archive it is used as the binary directly.

  Args:
    url: Path to an existing local file, either the binary or a zip of it.
    bin_cache: Directory the zip member is extracted into.
    ignore_cache: When True (the default) always re-extract the archive,
      even if a previously extracted binary exists in ``bin_cache``.

  Returns:
    The path of the binary, with the owner-executable bit set.

  Raises:
    AssertionError: If ``url`` is not an existing regular file (only when
      assertions are enabled).
  """
  assert (os.path.isfile(url))

  if zipfile.is_zipfile(url):
    target = os.path.splitext(os.path.basename(url))[0]
    target_url = os.path.join(bin_cache, target)
    if not ignore_cache and os.path.exists(target_url):
      _LOGGER.info(
          'Using cached prism binary from %s for %s', target_url, url)
    else:
      # Unzip when the cache is disabled (ignore_cache is True) or when no
      # previously extracted binary exists at the target path.
      _LOGGER.info("Unzipping prism from %s to %s", url, target_url)
      z = zipfile.ZipFile(url)
      try:
        target_url = z.extract(target, path=bin_cache)
      finally:
        # Release the archive's file handle even if extraction fails.
        z.close()
  else:
    target_url = url

  _LOGGER.info("Prism binary path resolved to: %s", target_url)
  # Make sure the binary is executable.
  st = os.stat(target_url)
  os.chmod(target_url, st.st_mode | stat.S_IEXEC)
  return target_url
107121

108122
# Finds the bin or zip in the local cache, and if not, fetches it.
109123
@classmethod
@@ -114,29 +128,38 @@ def local_bin(
114128
if bin_cache == '':
115129
bin_cache = cls.BIN_CACHE
116130
if os.path.exists(url):
117-
_LOGGER.info('Using local prism binary from %s' % url)
118-
return cls.maybe_unzip_and_make_executable(url, bin_cache=bin_cache)
131+
_LOGGER.info('Using local prism binary/zip from %s' % url)
132+
cached_file = url
119133
else:
120-
cached_bin = os.path.join(bin_cache, os.path.basename(url))
121-
if os.path.exists(cached_bin) and not ignore_cache:
122-
_LOGGER.info('Using cached prism binary from %s' % url)
134+
cached_file = os.path.join(bin_cache, os.path.basename(url))
135+
if os.path.exists(cached_file) and not ignore_cache:
136+
_LOGGER.info(
137+
'Using cached prism binary/zip from %s for %s' % (cached_file, url))
123138
else:
124-
_LOGGER.info('Downloading prism binary from %s' % url)
139+
_LOGGER.info('Downloading prism from %s' % url)
125140
if not os.path.exists(bin_cache):
126141
os.makedirs(bin_cache)
127142
try:
128143
try:
129144
url_read = FileSystems.open(url)
130145
except ValueError:
131146
url_read = urlopen(url)
132-
with open(cached_bin + '.tmp', 'wb') as zip_write:
147+
with open(cached_file + '.tmp', 'wb') as zip_write:
133148
shutil.copyfileobj(url_read, zip_write, length=1 << 20)
134-
os.rename(cached_bin + '.tmp', cached_bin)
149+
if os.path.isfile(cached_file):
150+
# Remove existing binary to prevent exception on Windows during
151+
# os.rename.
152+
# See: https://docs.python.org/3/library/os.html#os.rename
153+
os.remove(cached_file)
154+
os.rename(cached_file + '.tmp', cached_file)
135155
except URLError as e:
136156
raise RuntimeError(
137157
'Unable to fetch remote prism binary at %s: %s' % (url, e))
138-
return cls.maybe_unzip_and_make_executable(
139-
cached_bin, bin_cache=bin_cache)
158+
# If we just downloaded a new prism, always use it instead of any
159+
# previously cached (unzipped) binary.
160+
ignore_cache = True
161+
return cls.maybe_unzip_and_make_executable(
162+
cached_file, bin_cache=bin_cache, ignore_cache=ignore_cache)
140163

141164
def construct_download_url(self, root_tag: str, sys: str, mach: str) -> str:
142165
"""Construct the prism download URL with the appropriate release tag.

sdks/python/apache_beam/runners/portability/prism_runner_test.py

Lines changed: 162 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,21 +18,25 @@
1818

1919
import argparse
2020
import logging
21+
import os.path
2122
import shlex
2223
import typing
2324
import unittest
25+
import zipfile
2426
from os import linesep
25-
from os import path
2627
from os.path import exists
2728
from shutil import rmtree
2829
from tempfile import mkdtemp
30+
from unittest import mock
2931

3032
import pytest
33+
from parameterized import parameterized
3134

3235
import apache_beam as beam
3336
from apache_beam.options.pipeline_options import DebugOptions
3437
from apache_beam.options.pipeline_options import PortableOptions
3538
from apache_beam.runners.portability import portable_runner_test
39+
from apache_beam.runners.portability import prism_runner
3640
from apache_beam.testing.util import assert_that
3741
from apache_beam.testing.util import equal_to
3842

@@ -119,10 +123,10 @@ def _create_conf_dir(cls):
119123
cls.conf_dir = mkdtemp(prefix='prismtest-conf')
120124

121125
# path for a FileReporter to write metrics to
122-
cls.test_metrics_path = path.join(cls.conf_dir, 'test-metrics.txt')
126+
cls.test_metrics_path = os.path.join(cls.conf_dir, 'test-metrics.txt')
123127

124128
# path to write Prism configuration to
125-
conf_path = path.join(cls.conf_dir, 'prism-conf.yaml')
129+
conf_path = os.path.join(cls.conf_dir, 'prism-conf.yaml')
126130
file_reporter = 'org.apache.beam.runners.prism.metrics.FileReporter'
127131
with open(conf_path, 'w') as f:
128132
f.write(
@@ -220,8 +224,162 @@ def test_custom_window_type(self):
220224
def test_metrics(self):
221225
super().test_metrics(check_bounded_trie=False)
222226

227+
# Inherits all other tests.
228+
229+
230+
class PrismJobServerTest(unittest.TestCase):
  """Checks how PrismJobServer.local_bin resolves local and remote paths.

  Each test works in a fresh temporary directory that contains a "cache"
  sub-directory passed to local_bin as the binary cache. zipfile and urlopen
  are patched, so no real archive is extracted and nothing is downloaded.
  """

  # All True/False combinations, enumerated with the first flag varying
  # slowest.
  _BOOL_PAIRS = [[a, b] for a in (True, False) for b in (True, False)]
  _BOOL_TRIPLES = [[a, b, c] for a in (True, False) for b in (True, False)
                   for c in (True, False)]

  def setUp(self) -> None:
    self.local_dir = mkdtemp()
    self.cache_dir = os.path.join(self.local_dir, "cache")
    os.mkdir(self.cache_dir)

    self.job_server = prism_runner.PrismJobServer(options=PortableOptions())

    # Locations of the (optional) binary and zip, outside and inside the
    # cache directory.
    self.local_bin_path = os.path.join(self.local_dir, "my_prism_bin")
    self.local_zip_path = os.path.join(self.local_dir, "my_prism_bin.zip")
    self.cache_bin_path = os.path.join(self.cache_dir, "my_prism_bin")
    self.cache_zip_path = os.path.join(self.cache_dir, "my_prism_bin.zip")
    self.remote_zip_path = "https://github.com/apache/beam/releases/download/fake_ver/my_prism_bin.zip"  # pylint: disable=line-too-long

  def tearDown(self) -> None:
    rmtree(self.local_dir)

  @staticmethod
  def _touch(file_path):
    """Creates (or truncates to) an empty file at file_path."""
    open(file_path, 'wb').close()

  @staticmethod
  def _touch_zip(zip_path):
    """Writes an empty zip archive at zip_path."""
    zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED).close()

  def _make_local_bin(self):
    self._touch(self.local_bin_path)

  def _make_local_zip(self):
    self._touch_zip(self.local_zip_path)

  def _make_cache_bin(self):
    self._touch(self.cache_bin_path)

  def _make_cache_zip(self):
    self._touch_zip(self.cache_zip_path)

  def _extract_side_effect(self, a, path=None):
    """Stands in for ZipFile.extract: creates the binary, returns its path."""
    if path is None:
      return a

    # Materialize the "extracted" binary wherever extraction was directed.
    make_bin = (
        self._make_cache_bin
        if path.startswith(self.cache_dir) else self._make_local_bin)
    make_bin()

    return os.path.join(str(path), a)

  def _new_zipfile_mock(self):
    """Returns a fake ZipFile instance whose extract() fakes the unzip."""
    fake_zipfile = mock.MagicMock()
    fake_zipfile.extract = mock.Mock(side_effect=self._extract_side_effect)
    return fake_zipfile

  @parameterized.expand(_BOOL_PAIRS)
  def test_with_unknown_path(self, custom_bin_cache, ignore_cache):
    bin_cache = self.cache_dir if custom_bin_cache else ''
    with self.assertRaises(FileNotFoundError):
      self.job_server.local_bin(
          "/path/unknown", bin_cache=bin_cache, ignore_cache=ignore_cache)

  @parameterized.expand(_BOOL_TRIPLES)
  def test_with_local_binary_and_zip(
      self, has_cache_bin, has_cache_zip, ignore_cache):
    self._make_local_bin()
    self._make_local_zip()
    if has_cache_bin:
      self._make_cache_bin()
    if has_cache_zip:
      self._make_cache_zip()

    with mock.patch('zipfile.is_zipfile') as mock_is_zipfile, \
        mock.patch('zipfile.ZipFile') as mock_zipfile_init:
      mock_zipfile_init.return_value = self._new_zipfile_mock()

      # Path points at a local binary: it is always used directly, even if
      # a cached copy exists, and nothing is unzipped.
      mock_is_zipfile.return_value = False
      resolved = self.job_server.local_bin(
          self.local_bin_path, self.cache_dir, ignore_cache)
      self.assertEqual(resolved, self.local_bin_path)

      mock_zipfile_init.assert_not_called()

      # Path points at a local zip: the result is the binary in the cache
      # directory, unzipped only if no cached binary is available or
      # ignore_cache is true.
      mock_is_zipfile.return_value = True
      resolved = self.job_server.local_bin(
          self.local_zip_path, self.cache_dir, ignore_cache)
      self.assertEqual(resolved, self.cache_bin_path)

      if ignore_cache or not has_cache_bin:
        mock_zipfile_init.assert_called_once()
        mock_zipfile_init.reset_mock()
      else:
        # If the cache is enabled and the binary is in the cache, we won't
        # unzip.
        mock_zipfile_init.assert_not_called()

  @parameterized.expand(_BOOL_TRIPLES)
  def test_with_remote_path(self, has_cache_bin, has_cache_zip, ignore_cache):
    if has_cache_bin:
      self._make_cache_bin()
    if has_cache_zip:
      self._make_cache_zip()

    # Fake HTTP response whose body is empty.
    mock_response = mock.MagicMock()
    mock_response.read.return_value = b''

    with mock.patch(
        'apache_beam.runners.portability.prism_runner.urlopen') as mock_urlopen, \
        mock.patch('zipfile.is_zipfile') as mock_is_zipfile, \
        mock.patch('zipfile.ZipFile') as mock_zipfile_init:
      mock_urlopen.return_value = mock_response
      mock_zipfile_init.return_value = self._new_zipfile_mock()
      mock_is_zipfile.return_value = True

      resolved = self.job_server.local_bin(
          self.remote_zip_path, self.cache_dir, ignore_cache=ignore_cache)
      self.assertEqual(resolved, self.cache_bin_path)

      if ignore_cache or not has_cache_zip:
        mock_urlopen.assert_called_once()
      else:
        # If the cache is enabled and the zip is in the cache, we won't
        # download.
        mock_urlopen.assert_not_called()

      if ignore_cache or not (has_cache_bin and has_cache_zip):
        mock_zipfile_init.assert_called_once()
      else:
        # If the cache is enabled and both the binary and the zip are in
        # the cache, we won't unzip.
        mock_zipfile_init.assert_not_called()
223382

224-
# Inherits all other tests.
225383

226384
if __name__ == '__main__':
227385
# Run the tests.

0 commit comments

Comments
 (0)