From b40b76df5d752e0b128b921b648876d5127a2bb1 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Sun, 13 Feb 2022 15:04:27 +0100 Subject: [PATCH 1/5] resmgr: properly implement --overwrite, fix #690 --- ocrd/ocrd/resource_manager.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 06832e0fd2..9c2d34cd53 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -1,7 +1,7 @@ from pathlib import Path from os.path import join -from os import environ, listdir, getcwd, path -from shutil import copytree +from os import environ, listdir, getcwd, path, unlink +from shutil import copytree as copytree_, rmtree from datetime import datetime from tarfile import open as open_tarfile from urllib.parse import urlparse, unquote @@ -14,6 +14,11 @@ from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT +def copytree(src, dst, *args, overwrite=False, **kwargs): + if overwrite: + rmtree(dst) + return copytree_(src, dst, *args, **kwargs) + class OcrdResourceManager(): """ @@ -191,9 +196,11 @@ def parameter_usage(self, name, usage='as-is'): return Path(name).stem raise ValueError("No such usage '%s'" % usage) - def _download_impl(self, url, filename, progress_cb=None, size=None): + def _download_impl(self, url, filename, progress_cb=None, size=None, overwrite=False): log = getLogger('ocrd.resource_manager._download_impl') log.info("Downloading %s to %s" % (url, filename)) + if Path(filename).exists() and overwrite: + unlink(filename) with open(filename, 'wb') as f: with requests.get(url, stream=True) as r: total = size if size else int(r.headers.get('content-length')) @@ -202,9 +209,11 @@ def _download_impl(self, url, filename, progress_cb=None, size=None): progress_cb(len(data)) f.write(data) - def _copy_impl(self, src_filename, filename, progress_cb=None): + def _copy_impl(self, src_filename, filename, progress_cb=None, overwrite=False): log = getLogger('ocrd.resource_manager._copy_impl') log.info("Copying %s" % src_filename) + if Path(filename).exists() and overwrite: + unlink(filename) with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in: while True: chunk = f_in.read(4096) @@ -245,22 +254,22 @@ def download( destdir.mkdir(parents=True, exist_ok=True) if resource_type == 'file': if is_url: - self._download_impl(url, fpath, progress_cb) + self._download_impl(url, fpath, progress_cb, overwrite=overwrite) else: - self._copy_impl(url, fpath, progress_cb) + self._copy_impl(url, fpath, progress_cb, overwrite=overwrite) elif resource_type == 'tarball': with pushd_popd(tempdir=True): if is_url: - self._download_impl(url, 'download.tar.xx', progress_cb, size) + self._download_impl(url, 'download.tar.xx', progress_cb, size, overwrite=overwrite) else: - self._copy_impl(url, 'download.tar.xx', progress_cb) + self._copy_impl(url, 'download.tar.xx', progress_cb, overwrite=overwrite) Path('out').mkdir() with pushd_popd('out'): log.info("Extracting tarball") with open_tarfile('../download.tar.xx', 'r:*') as tar: tar.extractall() log.info("Copying '%s' from tarball to %s" % (path_in_archive, fpath)) - copytree(path_in_archive, str(fpath)) + copytree(path_in_archive, str(fpath), overwrite=overwrite) # TODO # elif resource_type == 'github-dir': return fpath From 9bc3aed85e7662342228025e443f26a6229ea321 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 12 Dec 2022 18:42:40 +0100 Subject: [PATCH 2/5] resmgr: simplify --overwrite logic --- ocrd/ocrd/cli/resmgr.py | 37 +++++++++++++++++++---------------- ocrd/ocrd/resource_manager.py | 34 +++++++++----------------------- 2 files changed, 29 insertions(+), 42 deletions(-) diff --git a/ocrd/ocrd/cli/resmgr.py b/ocrd/ocrd/cli/resmgr.py index a2ddddf470..c41892a87b 100644 --- a/ocrd/ocrd/cli/resmgr.py +++ b/ocrd/ocrd/cli/resmgr.py @@ -151,23 +151,26 @@ def download(any_url, no_dynamic, resource_type, path_in_archive, allow_uninstal if not basedir: basedir = resmgr.location_to_resource_dir('data') - with click.progressbar(length=resdict['size']) as bar: - fpath = resmgr.download( - this_executable, - resdict['url'], - name=resdict['name'], - resource_type=resdict.get('type', resource_type), - path_in_archive=resdict.get('path_in_archive', path_in_archive), - overwrite=overwrite, - no_subdir=location in ['cwd', 'module'], - basedir=basedir, - progress_cb=lambda delta: bar.update(delta) - ) - if registered == 'unregistered': - log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", this_executable, name, any_url, resmgr.user_list) - resmgr.add_to_user_database(this_executable, fpath, url=any_url) - resmgr.save_user_list() - log.info("Installed resource %s under %s", resdict['url'], fpath) + try: + with click.progressbar(length=resdict['size']) as bar: + fpath = resmgr.download( + this_executable, + resdict['url'], + name=resdict['name'], + resource_type=resdict.get('type', resource_type), + path_in_archive=resdict.get('path_in_archive', path_in_archive), + overwrite=overwrite, + no_subdir=location in ['cwd', 'module'], + basedir=basedir, + progress_cb=lambda delta: bar.update(delta) + ) + if registered == 'unregistered': + log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", this_executable, name, any_url, resmgr.user_list) + resmgr.add_to_user_database(this_executable, fpath, url=any_url) + resmgr.save_user_list() + log.info("Installed resource %s under %s", resdict['url'], fpath) + except FileExistsError as exc: + log.info(str(exc)) log.info("Use in parameters as '%s'", resmgr.parameter_usage(resdict['name'], usage=resdict.get('parameter_usage', 'as-is'))) @resmgr_cli.command('migrate') diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 62aa2c5031..2fb4bbabd8 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -25,11 +25,6 @@ from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT -def copytree(src, dst, *args, overwrite=False, **kwargs): - if overwrite: - rmtree(dst) - return copytree_(src, dst, *args, **kwargs) - class OcrdResourceManager(): """ @@ -242,13 +237,6 @@ def parameter_usage(self, name, usage='as-is'): def _download_impl(self, url, filename, progress_cb=None, size=None, overwrite=False): log = getLogger('ocrd.resource_manager._download_impl') log.info("Downloading %s to %s" % (url, filename)) - if Path(filename).exists(): - if not overwrite: - raise FileExistsError("%s %s already exists but overwrite is not set" % ('Directory' if Path(filename).is_dir() else 'File', filename)) - if Path(filename).is_dir(): - rmtree(filename) - else: - unlink(filename) with open(filename, 'wb') as f: with requests.get(url, stream=True) as r: for data in r.iter_content(chunk_size=4096): @@ -260,11 +248,6 @@ def _copy_impl(self, src_filename, filename, progress_cb=None, overwrite=False): log = getLogger('ocrd.resource_manager._copy_impl') log.info("Copying %s to %s", src_filename, filename) if Path(src_filename).is_dir(): - if Path(filename).exists(): - if not overwrite: - raise FileExistsError("Directory %s already exists but overwrite is not set" % filename) - log.info("Removing existing target directory %s", filename) - rmtree(filename) log.info(f"Copying recursively from {src_filename} to {filename}") for child in Path(src_filename).rglob('*'): child_dst = Path(filename) / child.relative_to(src_filename) @@ -279,11 +262,6 @@ def _copy_impl(self, src_filename, filename, progress_cb=None, overwrite=False): else: break else: - if Path(filename).exists(): - if not overwrite: - raise FileExistsError("File %s already exists but overwrite is not set" % filename) - log.info("Removing existing target file %s", filename) - unlink(filename) with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in: while True: chunk = f_in.read(4096) @@ -317,9 +295,15 @@ def download( name = Path(unquote(url_parsed.path)).name fpath = Path(destdir, name) is_url = url.startswith('https://') or url.startswith('http://') - if fpath.exists() and not overwrite: - log.info("%s to be %s to %s which already exists and overwrite is False" % (url, 'downloaded' if is_url else 'copied', fpath)) - return fpath + if fpath.exists(): + if not overwrite: + raise FileExistsError("%s %s already exists but --overwrite is not set" % ('Directory' if fpath.is_dir() else 'File', fpath)) + if fpath.is_dir(): + log.info("Removing existing target directory {fpath}") + rmtree(str(fpath)) + else: + log.info("Removing existing target file {fpath}") + unlink(str(fpath)) destdir.mkdir(parents=True, exist_ok=True) if resource_type in ('file', 'directory'): if is_url: From ec463b60ed676547d1748d714a8f84dacc68a4d7 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 12 Dec 2022 18:43:05 +0100 Subject: [PATCH 3/5] tests: adapt to resmgr --overwrite change, speed up --- tests/cli/test_resmgr.py | 16 +++++++++------- tests/test_resource_manager.py | 10 +++++----- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/cli/test_resmgr.py b/tests/cli/test_resmgr.py index 46f18336e0..2d746580b0 100644 --- a/tests/cli/test_resmgr.py +++ b/tests/cli/test_resmgr.py @@ -8,10 +8,11 @@ from ocrd.resource_manager import OcrdResourceManager runner = CliRunner() -executable = 'ocrd-tesserocr-recognize' +executable = 'ocrd-dummy' @fixture def mgr_with_tmp_path(tmp_path): + print(tmp_path) mgr = OcrdResourceManager(xdg_data_home=tmp_path, userdir=tmp_path, xdg_config_home=tmp_path) env = {'XDG_DATA_HOME': str(tmp_path), 'XDG_CONFIG_HOME': str(tmp_path)} return tmp_path, mgr, env @@ -21,8 +22,8 @@ def test_url_tool_name_unregistered(mgr_with_tmp_path): We should add a test for the -n URL TOOL NAME use-case as well (both as an unregistered resource and as URL-override). """ tmp_path, mgr, env = mgr_with_tmp_path - print(mgr.list_installed('ocrd-tesserocr-recognize')[0][1]) - rsrcs_before = len(mgr.list_installed('ocrd-tesserocr-recognize')[0][1]) + print(mgr.list_installed(executable)[0][1]) + rsrcs_before = len(mgr.list_installed(executable)[0][1]) # add an unregistered resource url = 'https://github.com/tesseract-ocr/tessdata_best/raw/main/dzo.traineddata' @@ -30,7 +31,7 @@ def test_url_tool_name_unregistered(mgr_with_tmp_path): r = runner.invoke(resmgr_cli, ['download', '--allow-uninstalled', '--any-url', url, executable, name], env=env) mgr.load_resource_list(mgr.user_list) - rsrcs = mgr.list_installed('ocrd-tesserocr-recognize')[0][1] + rsrcs = mgr.list_installed(executable)[0][1] assert len(rsrcs) == rsrcs_before + 1 assert rsrcs[0]['name'] == name assert rsrcs[0]['url'] == url @@ -38,12 +39,13 @@ def test_url_tool_name_unregistered(mgr_with_tmp_path): # add resource with different URL but same name url2 = url.replace('dzo', 'bos') r = runner.invoke(resmgr_cli, ['download', '--allow-uninstalled', '--any-url', url2, executable, name], env=env) - assert 'already exists and overwrite is False' in r.output + assert 'already exists but --overwrite is not set' in r.output r = runner.invoke(resmgr_cli, ['download', '--overwrite', '--allow-uninstalled', '--any-url', url2, executable, name], env=env) - assert 'already exists and overwrite is False' not in r.output + assert 'already exists but --overwrite is not set' not in r.output + mgr.load_resource_list(mgr.user_list) - rsrcs = mgr.list_installed('ocrd-tesserocr-recognize')[0][1] + rsrcs = mgr.list_installed(executable)[0][1] print(rsrcs) assert len(rsrcs) == rsrcs_before + 1 assert rsrcs[0]['name'] == name diff --git a/tests/test_resource_manager.py b/tests/test_resource_manager.py index f889a4abf0..6d34fa8dc9 100644 --- a/tests/test_resource_manager.py +++ b/tests/test_resource_manager.py @@ -32,7 +32,7 @@ def test_resources_manager_config_default(monkeypatch, tmp_path): assert mgr.add_to_user_database('ocrd-foo', f) # pdb.set_trace() - mgr.list_installed() + mgr.list_installed('ocrd-foo') proc = 'ocrd-tesserocr-recognize' # TODO mock request fpath = mgr.download(proc, CONST_RESOURCE_URL_LAYOUT, mgr.location_to_resource_dir('data')) @@ -55,7 +55,7 @@ def test_resources_manager_from_environment(tmp_path, monkeypatch): assert f.exists() assert f == mgr.user_list assert mgr.add_to_user_database('ocrd-foo', f) - mgr.list_installed() + mgr.list_installed('ocrd-foo') proc = 'ocrd-tesserocr-recognize' fpath = mgr.download(proc, CONST_RESOURCE_URL_LAYOUT, mgr.location_to_resource_dir('data')) assert fpath.exists() @@ -66,14 +66,14 @@ def test_resources_manager_from_environment(tmp_path, monkeypatch): def test_resources_manager_config_explicite(tmp_path): # act - mgr = OcrdResourceManager(xdg_config_home=str(tmp_path)) + mgr = OcrdResourceManager(xdg_config_home=str(tmp_path / 'config'), xdg_data_home=str(tmp_path / 'data')) # assert - f = tmp_path / 'ocrd' / CONST_RESOURCE_YML + f = tmp_path / 'config' / 'ocrd' / CONST_RESOURCE_YML assert f.exists() assert f == mgr.user_list assert mgr.add_to_user_database('ocrd-foo', f) - mgr.list_installed() + mgr.list_installed(executable='ocrd-foo') proc = 'ocrd-tesserocr-recognize' fpath = mgr.download(proc, CONST_RESOURCE_URL_LAYOUT, mgr.location_to_resource_dir('data')) assert fpath.exists() From 5b995e29d152b0c108e7bfc5114e2ef74a008ec1 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 12 Dec 2022 18:45:21 +0100 Subject: [PATCH 4/5] test resgmgr download --overwrite for directory --- tests/cli/test_resmgr.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/cli/test_resmgr.py b/tests/cli/test_resmgr.py index 2d746580b0..6cec6225b8 100644 --- a/tests/cli/test_resmgr.py +++ b/tests/cli/test_resmgr.py @@ -75,3 +75,18 @@ def test_directory_copy(mgr_with_tmp_path): assert not r.exception assert Path(mgr_path / 'ocrd-resources' / proc).exists() assert directory_size(mgr_path / 'ocrd-resources' / proc / res_name) == 30 + + r = runner.invoke( + resmgr_cli, + ['download', '--allow-uninstalled', '--any-url', tmp_path, proc, res_name], + env=env, + catch_exceptions=False + ) + assert 'already exists but --overwrite is not set' in r.output + r = runner.invoke( + resmgr_cli, + ['download', '--overwrite', '--allow-uninstalled', '--any-url', tmp_path, proc, res_name], + env=env, + catch_exceptions=False + ) + assert 'already exists but --overwrite is not set' not in r.output From 9a89127e3b117b9e2e9453331a6a62e905d281db Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 12 Dec 2022 18:49:42 +0100 Subject: [PATCH 5/5] fix regression from 2fb4bbabd --- ocrd/ocrd/resource_manager.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/ocrd/ocrd/resource_manager.py b/ocrd/ocrd/resource_manager.py index 2fb4bbabd8..6ac6d26db4 100644 --- a/ocrd/ocrd/resource_manager.py +++ b/ocrd/ocrd/resource_manager.py @@ -1,11 +1,10 @@ from pathlib import Path from os.path import join from os import environ, listdir, getcwd, path, unlink -from shutil import copytree as copytree_, rmtree +from shutil import copytree, rmtree from json import loads from os import environ, listdir, getcwd, path from fnmatch import filter as apply_glob -from shutil import copytree from datetime import datetime from tarfile import open as open_tarfile from urllib.parse import urlparse, unquote @@ -234,7 +233,7 @@ def parameter_usage(self, name, usage='as-is'): return Path(name).stem raise ValueError("No such usage '%s'" % usage) - def _download_impl(self, url, filename, progress_cb=None, size=None, overwrite=False): + def _download_impl(self, url, filename, progress_cb=None, size=None): log = getLogger('ocrd.resource_manager._download_impl') log.info("Downloading %s to %s" % (url, filename)) with open(filename, 'wb') as f: @@ -244,7 +243,7 @@ def _download_impl(self, url, filename, progress_cb=None, size=None, overwrite=F progress_cb(len(data)) f.write(data) - def _copy_impl(self, src_filename, filename, progress_cb=None, overwrite=False): + def _copy_impl(self, src_filename, filename, progress_cb=None): log = getLogger('ocrd.resource_manager._copy_impl') log.info("Copying %s to %s", src_filename, filename) if Path(src_filename).is_dir(): @@ -307,22 +306,22 @@ def download( destdir.mkdir(parents=True, exist_ok=True) if resource_type in ('file', 'directory'): if is_url: - self._download_impl(url, fpath, progress_cb, overwrite=overwrite) + self._download_impl(url, fpath, progress_cb) else: - self._copy_impl(url, fpath, progress_cb, overwrite=overwrite) + self._copy_impl(url, fpath, progress_cb) elif resource_type == 'archive': with pushd_popd(tempdir=True) as tempdir: if is_url: - self._download_impl(url, 'download.tar.xx', progress_cb, overwrite=overwrite) + self._download_impl(url, 'download.tar.xx', progress_cb) else: - self._copy_impl(url, 'download.tar.xx', progress_cb, overwrite=overwrite) + self._copy_impl(url, 'download.tar.xx', progress_cb) Path('out').mkdir() with pushd_popd('out'): log.info("Extracting archive to %s/out" % tempdir) with open_tarfile('../download.tar.xx', 'r:*') as tar: tar.extractall() log.info("Copying '%s' from tarball to %s" % (path_in_archive, fpath)) - copytree(path_in_archive, str(fpath), overwrite=overwrite) + copytree(path_in_archive, str(fpath)) return fpath def _dedup_database(self, database=None, dedup_key='name'):