Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 82 additions & 38 deletions ocrd/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@
from os.path import relpath, exists, join, isabs, dirname, basename, abspath
from pathlib import Path
import sys
from itertools import product
from glob import glob # XXX pathlib.Path.glob does not support absolute globs
import re

import click

from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
from ocrd.cli import command_with_replaced_help
from ocrd.decorators import ocrd_mets_filter_options
from ocrd_models import OcrdMetsFilter
from ocrd_utils import getLogger, pushd_popd, EXT_TO_MIME
from . import command_with_replaced_help

log = getLogger('ocrd.cli.workspace')

Expand Down Expand Up @@ -56,6 +59,7 @@ def workspace_cli(ctx, directory, mets, mets_basename, backup):
"""
Working with workspace
"""
ctx.max_content_width = 120
ctx.obj = WorkspaceCtx(directory, mets_url=mets, mets_basename=mets_basename, automatic_backup=backup)

# ----------------------------------------------------------------------
Expand Down Expand Up @@ -106,11 +110,12 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency
(r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning")
@ocrd_mets_filter_options()
@click.argument('mets_url')
# XXX deprecated
@click.argument('workspace_dir', default=None, required=False)
@pass_workspace
def workspace_clone(ctx, clobber_mets, download, mets_url, workspace_dir):
def workspace_clone(ctx, clobber_mets, download, mets_url, workspace_dir, **filter_args):
"""
Create a workspace from METS_URL and return the directory

Expand All @@ -127,7 +132,7 @@ def workspace_clone(ctx, clobber_mets, download, mets_url, workspace_dir):
dst_dir=os.path.abspath(ctx.directory),
mets_basename=basename(ctx.mets_url),
clobber_mets=clobber_mets,
download=download,
download=filter_args if download else False,
)
workspace.save_mets()
print(workspace.directory)
Expand Down Expand Up @@ -164,10 +169,28 @@ def workspace_init(ctx, clobber_mets, directory):
# ----------------------------------------------------------------------

@workspace_cli.command('add')
@click.option('-G', '--file-grp', help="fileGrp USE", required=True)
@click.option('-i', '--file-id', help="ID for the file", required=True)
@click.option('-m', '--mimetype', help="Media type of the file", required=True)
@click.option('-g', '--page-id', help="ID of the physical page")
@ocrd_mets_filter_options(
operators=['in'],
fields=['ID', 'mimetype', 'fileGrp', 'pageId'],
help_field=dict(
fileGrp='fileGrp USE',
mimetype='Media type',
pageId='Page ID of the physical page',
ID='ID'),
metavar='VAL',
required=dict(
fileGrp_include=True,
ID_include=True,
mimetype_include=True,
),
help_operation='of the file to add',
help_type='(string)',
parameter=dict(
fileGrp_include='file_grp',
ID_include='file_id',
mimetype_include='mimetype',
pageId_include='page_id',
))
@click.option('-C', '--check-file-exists', help="Whether to ensure FNAME exists", is_flag=True, default=False)
@click.option('--ignore', help="Do not check whether file exists.", default=False, is_flag=True)
@click.option('--force', help="If file with ID already exists, replace it. No effect if --ignore is set.", default=False, is_flag=True)
Expand Down Expand Up @@ -213,11 +236,26 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_
# pylint: disable=bad-whitespace, broad-except
@workspace_cli.command('bulk-add')
@click.option('-r', '--regex', help="Regular expression matching the FILE_GLOB filesystem paths to define named captures usable in the other parameters", required=True)
@click.option('-m', '--mimetype', help="Media type of the file. If not provided, guess from filename", required=False)
@click.option('-g', '--page-id', help="physical page ID of the file", required=False)
@click.option('-i', '--file-id', help="ID of the file", required=True)
@ocrd_mets_filter_options(
operators=['in'],
help_field=dict(
fileGrp='File group USE of the files',
mimetype='Media type of the files (if not provided, guess from filename)',
pageId='physical page ID of the files',
ID='ID of the file'),
metavar='VAL',
help_operation='to add',
help_type='(string/regex)',
required=dict(
fileGrp_include=True,
ID_include=True),
parameter=dict(
fileGrp_include='file_grp',
ID_include='file_id',
mimetype_include='mimetype',
pageId_include='page_id',
url_include='url'))
@click.option('-u', '--url', help="local filesystem path in the workspace directory (copied from source file if different)", required=True)
@click.option('-G', '--file-grp', help="File group USE of the file", required=True)
@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview", default=False, is_flag=True)
@click.option('-I', '--ignore', help="Disable checking for existing file entries (faster)", default=False, is_flag=True)
@click.option('-f', '--force', help="Replace existing file entries with the same ID (no effect when --ignore is set, too)", default=False, is_flag=True)
Expand Down Expand Up @@ -281,10 +319,9 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (file_path.suffix, file_path))

# expand templates
for param_name in file_dict:
for group_name in group_dict:
file_dict[param_name] = file_dict[param_name].replace('{{ %s }}' % group_name, group_dict[group_name])

for field, (found_field, found_val) in product(file_dict.keys(), group_dict.items()):
file_dict[field] = file_dict[field].replace('{{ %s }}' % found_field,
found_val if not isinstance(found_val, list) else '-'.join(found_val))
# copy files
if file_dict['url']:
urlpath = Path(workspace.directory, file_dict['url'])
Expand All @@ -311,11 +348,15 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
# ----------------------------------------------------------------------

@workspace_cli.command('find')
@click.option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER')
@click.option('-m', '--mimetype', help="Media type to look for", metavar='FILTER')
@click.option('-g', '--page-id', help="Page ID", metavar='FILTER')
@click.option('-i', '--file-id', help="ID", metavar='FILTER')
# pylint: disable=bad-continuation
@ocrd_mets_filter_options(
help_field=dict(
fileGrp='fileGrp USE',
mimetype='Media type',
pageId='Page ID',
ID='ID'),
metavar='FILTER',
help_operation='of files to find',
help_type='(string)')
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
default=['url'],
multiple=True,
Expand All @@ -331,7 +372,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ")
@pass_workspace
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download):
def workspace_find(ctx, output_field, download, **filter_args):
"""
Find files.

Expand All @@ -341,12 +382,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down
modified_mets = False
ret = list()
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url))
for f in workspace.mets.find_files(
ID=file_id,
fileGrp=file_grp,
mimetype=mimetype,
pageId=page_id,
):
for f in OcrdMetsFilter(**filter_args).find_files(workspace):
if download and not f.local_filename:
workspace.download_file(f)
modified_mets = True
Expand Down Expand Up @@ -412,12 +448,25 @@ def remove_group(ctx, group, recursive, force, keep_files):
# ----------------------------------------------------------------------

@workspace_cli.command('prune-files')
@click.option('-G', '--file-grp', help="fileGrp USE", metavar='FILTER')
@click.option('-m', '--mimetype', help="Media type to look for", metavar='FILTER')
@click.option('-g', '--page-id', help="Page ID", metavar='FILTER')
@click.option('-i', '--file-id', help="ID", metavar='FILTER')
@ocrd_mets_filter_options(
help_field=dict(
fileGrp='fileGrp USE',
mimetype='Media type',
pageId='ID of physical page',
ID='ID'),
metavar='FILTER',
help_operation=dict(
include='of files to prune',
excluded='of files NOT to prune',
),
parameter=dict(
fileGrp_include='file_grp',
ID_include='file_id',
mimetype_include='mimetype',
pageId_include='page_id',
url_include='url'))
@pass_workspace
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
def prune_files(ctx, **filter_args):
"""
Removes mets:files that point to non-existing local files

Expand All @@ -426,12 +475,7 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id):
"""
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup)
with pushd_popd(workspace.directory):
for f in workspace.mets.find_files(
ID=file_id,
fileGrp=file_grp,
mimetype=mimetype,
pageId=page_id,
):
for f in OcrdMetsFilter(**filter_args).find_files(workspace):
try:
if not f.local_filename or not exists(f.local_filename):
workspace.mets.remove_file(f.ID)
Expand Down
104 changes: 101 additions & 3 deletions ocrd/ocrd/decorators.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from os.path import isfile
from re import match, sub, IGNORECASE
from itertools import product
import sys
from string import Template

import click

Expand All @@ -15,6 +18,7 @@
from .resolver import Resolver
from .processor.base import run_processor
from ocrd_validators import WorkspaceValidator
from ocrd_models.ocrd_mets_filter import FIELDS

def _set_root_logger_version(ctx, param, value): # pylint: disable=unused-argument
setOverrideLogLevel(value)
Expand Down Expand Up @@ -114,11 +118,11 @@ def ocrd_cli_options(f):

Usage::

import ocrd_click_cli from ocrd.utils
from ocrd.decorators import ocrd_cli_options

@click.command()
@ocrd_click_cli
def cli(mets_url):
@ocrd_cli_options
def cli(mets_url, **kwargs):
print(mets_url)
"""
params = [
Expand All @@ -141,3 +145,97 @@ def cli(mets_url):
for param in params:
param(f)
return f

# Default templates used by ``ocrd_mets_filter_options`` to build one
# ``click.option`` per field/operator combination. String values are
# ``string.Template`` sources; the placeholders ``${field}`` and
# ``${operator}`` (and, for ``help``, ``${operation}`` and ``${type}``) are
# filled in when the options are generated.
TEMPLATE_DEFAULTS = dict(
    # Placeholder shown for the option value in the usage text
    metavar='PAT',
    # Whether the option must be provided
    required=False,
    # Name of the keyword argument handed to the decorated command
    parameter='${field}_${operator}clude',
    # Overall help text, assembled from the three help_* parts below
    help='${field} ${operation} ${type}',
    help_field='${field}',
    help_operation='to ${operator}clude',
    help_type='(string/regex/comma-separated)',
)
class ocrd_mets_filter_options():
    """
    Decorator that adds include/exclude filter options to a click command.

    One ``click.option`` is attached to the decorated function for every
    combination of field and operator (``in`` for include, ``ex`` for
    exclude). Parameter name, ``required`` flag, metavar and help text of
    each option are rendered from ``string.Template`` templates, which start
    out as ``TEMPLATE_DEFAULTS`` and can be overridden per instance.
    """

    # Short and long flags of the include options.
    # XXX must be retained for backwards-compatibility
    _LEGACY_INCLUDE_FLAGS = {
        'ID': ['-i', '--file-id'],
        'pageId': ['-g', '--page-id'],
        'fileGrp': ['-G', '--file-grp'],
        'mimetype': ['-m', '--mimetype'],
    }

    def __init__(self, fields=FIELDS, operators=None, **templates):
        """
        Prepare the templates for all field/operator combinations.

        Arguments:
            fields: names of the fields to create options for
            operators: operators to create options for (``'in'`` and/or
                ``'ex'``); a falsy value selects both
            **templates: overrides for the entries of ``TEMPLATE_DEFAULTS``.
                Each override is either a single value applied to all
                options or a dict. Dict keys are ``<operator>clude`` for
                ``help_operation``, ``<field>_<operator>clude`` for
                ``parameter`` and ``required``, and the field name for
                everything else. Missing or falsy entries fall back to
                the default.
        """
        self.fields = fields
        self.operators = operators or ['ex', 'in']
        self.templates = {}
        effective = {**TEMPLATE_DEFAULTS, **templates}
        for (tpl_name, tpl), field, operator in product(effective.items(), self.fields, self.operators):
            compiled = self.templates.setdefault(tpl_name, {})
            key = self._template_key(tpl_name, field, operator)
            if key in compiled:
                # Several field/operator pairs can share a key, first one wins
                continue
            if isinstance(tpl, dict):
                source = tpl[key] if key in tpl else TEMPLATE_DEFAULTS[tpl_name]
            else:
                source = tpl or TEMPLATE_DEFAULTS[tpl_name]
            compiled[key] = Template(str(source))

    @staticmethod
    def _template_key(tpl_name, field, operator):
        """
        Return the key under which template ``tpl_name`` is stored for the
        given field and operator.
        """
        if tpl_name == 'help_operation':
            return '%sclude' % operator
        if tpl_name in ('parameter', 'required'):
            return '%s_%sclude' % (field, operator)
        return field

    @staticmethod
    def _split_commas(ctx, param, value):  # pylint: disable=unused-argument
        """
        click callback: turn a value containing commas into a list of
        strings, pass any other value through unchanged.
        """
        if value and ',' in value:
            return value.split(',')
        return value

    def _expand_template(self, tpl_name, field, operator, tpl_vars):
        """
        Substitute ``tpl_vars`` into template ``tpl_name`` for the given
        field and operator.

        Returns the resulting string, except for ``required`` where the
        result is a bool telling whether the expansion equals ``'True'``.
        """
        key = self._template_key(tpl_name, field, operator)
        expanded = self.templates[tpl_name][key].safe_substitute(tpl_vars)
        if tpl_name == 'required':
            return expanded == 'True'
        return expanded

    def __call__(self, f):
        """
        Attach one click option per field/operator combination to ``f`` and
        return ``f``.
        """
        for field, operator in product(self.fields, self.operators):

            def render(tpl_name, **extra_vars):
                # Only called within the current iteration, so closing over
                # the loop variables is safe here.
                tpl_vars = {'field': field, 'operator': operator, **extra_vars}
                return self._expand_template(tpl_name, field, operator, tpl_vars=tpl_vars)

            # XXX Controls the kwarg name of this field in the decorated command
            option_names = [render('parameter')]
            is_required = render('required')
            metavar = render('metavar')
            help_text = render(
                'help',
                field=render('help_field'),
                operation=render('help_operation'),
                type=render('help_type'))

            # XXX No regex search for pageId search currently
            if field == 'pageId' and operator == 'in':
                help_text = sub(r'[,/]?\s*regexp?\b', '', help_text, flags=IGNORECASE)

            if operator == 'in':
                option_names.extend(self._LEGACY_INCLUDE_FLAGS.get(field, []))

            # Long option: lower-cased field name, prefixed with "not-" for
            # exclude filters
            prefix = 'not-' if operator == 'ex' else ''
            option_names.append('--%s%s' % (prefix, field.lower()))

            click.option(
                *option_names,
                default=None,
                callback=self._split_commas,
                required=is_required,
                metavar=metavar,
                help=help_text
            )(f)
        return f
10 changes: 7 additions & 3 deletions ocrd/ocrd/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
nth_url_segment
)
from ocrd.workspace import Workspace
from ocrd_models import OcrdMets
from ocrd_models import OcrdMets, OcrdMetsFilter

log = getLogger('ocrd.resolver')

Expand Down Expand Up @@ -155,8 +155,12 @@ def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_ba

workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl)

if download:
for f in workspace.mets.find_files():
# XXX an empty dict is false-y but valid in this context
if download or download == {}:
if not isinstance(download, dict):
download = {}
mets_filter = OcrdMetsFilter(**download)
for f in mets_filter.find_files(workspace):
workspace.download_file(f)

return workspace
Expand Down
1 change: 1 addition & 0 deletions ocrd_models/ocrd_models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
from .ocrd_mets import OcrdMets
from .ocrd_xml_base import OcrdXmlDocument
from .report import ValidationReport
from .ocrd_mets_filter import OcrdMetsFilter
Loading