Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 24 additions & 5 deletions ocrd/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,18 @@
from os import getcwd
from os.path import relpath, exists, join, isabs
from pathlib import Path
from json import loads
from json import loads, dumps
import sys
from glob import glob # XXX pathlib.Path.glob does not support absolute globs
import re
import time
import numpy as np

import click

from ocrd import Resolver, Workspace, WorkspaceValidator, WorkspaceBackupManager
from ocrd.mets_server import OcrdMetsServer
from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file
from ocrd_utils import getLogger, initLogging, pushd_popd, EXT_TO_MIME, safe_filename, parse_json_string_or_file, partition_list
from ocrd.decorators import mets_find_options
from . import command_with_replaced_help

Expand Down Expand Up @@ -583,17 +584,35 @@ def list_groups(ctx):
print("\n".join(workspace.mets.file_groups))

# ----------------------------------------------------------------------
# ocrd workspace list-pages
# ocrd workspace list-page
# ----------------------------------------------------------------------

@workspace_cli.command('list-page')
@click.option('-f', '--output-format', help="Output format", type=click.Choice(['one-per-line', 'comma-separated', 'json']), default='one-per-line')
@click.option('-D', '--chunk-number', help="Partition the return value into n roughly equally sized chunks", default=1, type=int)
@click.option('-C', '--chunk-index', help="Output the nth chunk of results, -1 for all of them.", default=None, type=int)
@click.option('-r', '--page-id-range', help="Restrict the pages to those matching the provided range, based on the @ID attribute. Separate start/end with ..")
@click.option('-R', '--numeric-range', help="Restrict the pages to those in the range, in numerical document order. Separate start/end with ..")
@pass_workspace
def list_pages(ctx, output_format, chunk_number, chunk_index, page_id_range, numeric_range):
    """
    List physical page IDs
    """
    # NOTE: the docstring above is rendered by click as the command's --help
    # text, so implementation notes live in comments instead.
    #
    # Pipeline: collect page IDs -> optional 1-based numeric slice ->
    # partition into chunks -> print in the requested output format.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)

    find_kwargs = {}
    if page_id_range:
        find_kwargs['pageId'] = page_id_range
    # De-duplicate via a set, then sort by @ID string. Files without a page ID
    # are skipped: a None among the strings would make sorted() raise.
    # NOTE(review): presumably find_files can return files not linked to any
    # page -- confirm; the filter is harmless otherwise.
    ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})

    if numeric_range:
        try:
            start, end = map(int, numeric_range.split('..'))
        except ValueError as err:
            # Covers both non-integer bounds and a wrong number of '..' parts.
            raise click.BadParameter(
                "expected START..END with integer bounds, got '%s'" % numeric_range,
                param_hint='--numeric-range') from err
        if start < 1 or end < 1:
            # Bounds are 1-based; 0 or negatives would wrap around in the slice.
            raise click.BadParameter(
                "range bounds are 1-based and must be positive, got '%s'" % numeric_range,
                param_hint='--numeric-range')
        # 1-based inclusive range -> 0-based half-open slice
        ids = ids[start - 1:end]

    # The option help documents -1 as "all chunks"; without this mapping it
    # would be used as a Python index and select only the last chunk.
    if chunk_index == -1:
        chunk_index = None
    chunks = partition_list(ids, chunk_number, chunk_index)

    if output_format == 'one-per-line':
        print("\n".join(["\n".join(chunk) for chunk in chunks]))
    elif output_format == 'comma-separated':
        # one line per chunk
        print("\n".join([",".join(chunk) for chunk in chunks]))
    elif output_format == 'json':
        print(dumps(chunks))

# ----------------------------------------------------------------------
# ocrd workspace get-id
Expand Down
1 change: 1 addition & 0 deletions ocrd_utils/ocrd_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@
is_string,
make_file_id,
nth_url_segment,
partition_list,
parse_json_string_or_file,
parse_json_string_with_comments,
remove_non_path_from_url,
Expand Down
38 changes: 38 additions & 0 deletions ocrd_utils/ocrd_utils/str.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,28 @@
from .constants import REGEX_FILE_ID
from .deprecate import deprecation_warning
from warnings import warn
from math import ceil
import sys
from itertools import islice

if sys.version_info >= (3, 12):
    from itertools import batched
else:
    def batched(iterable, chunk_size):
        """
        Fallback for :py:func:`itertools.batched` on Python < 3.12.

        Yield successive tuples of at most ``chunk_size`` items taken from
        ``iterable``; the last tuple may be shorter.

        Args:
            iterable: any iterable to split into batches
            chunk_size (int): maximum number of items per batch

        Raises:
            ValueError: if ``chunk_size`` is less than 1 (raised once
                iteration starts, since this is a generator)
        """
        # Mirror the stdlib: without this check a chunk_size of 0 would
        # silently yield nothing instead of reporting the caller's mistake.
        if chunk_size < 1:
            raise ValueError('n must be at least one')
        iterator = iter(iterable)
        while True:
            chunk = tuple(islice(iterator, chunk_size))
            if not chunk:
                # iterator exhausted
                break
            yield chunk

__all__ = [
'assert_file_grp_cardinality',
'concat_padded',
'get_local_filename',
'is_local_filename',
'partition_list',
'is_string',
'make_file_id',
'nth_url_segment',
Expand Down Expand Up @@ -207,3 +223,25 @@ def generate_range(start, end):
for i in range(int(start_num), int(end_num) + 1):
ret.append(start.replace(start_num, str(i).zfill(len(start_num))))
return ret

def partition_list(lst, chunks, chunk_index=None):
    """
    Partition a list into roughly equally-sized chunks

    The items are distributed so that chunk sizes differ by at most one,
    with the larger chunks first (e.g. 10 items in 3 chunks -> 4, 3, 3).
    Exactly ``chunks`` chunks are produced, unless the list has fewer items
    than that, in which case every item gets its own chunk (no empty chunks
    are ever returned).

    Args:
        lst (list): list to partition
        chunks (int): number of chunks to generate (not per chunk!)

    Keyword Args:
        chunk_index (None|int): If provided, return only a list consisting of this chunk

    Returns:
        list(list())

    Raises:
        ValueError: if ``lst`` is non-empty and ``chunks`` is less than 1
        IndexError: if ``chunk_index`` does not refer to a generated chunk
    """
    if not lst:
        return []
    if chunks < 1:
        raise ValueError(f"chunks must be a positive integer, got {chunks!r}")
    items = list(lst)
    # Never produce empty chunks: cap the chunk count at the number of items
    n_chunks = min(chunks, len(items))
    # Every chunk gets base_size items; the first `extra` chunks get one more.
    # (Batching by ceil(len/chunks) could yield fewer chunks than requested,
    # e.g. 9 items in 4 chunks -> only 3 batches of 3.)
    base_size, extra = divmod(len(items), n_chunks)
    ret = []
    start = 0
    for i in range(n_chunks):
        end = start + base_size + (1 if i < extra else 0)
        ret.append(items[start:end])
        start = end
    if chunk_index is not None:
        return [ret[chunk_index]]
    return ret
16 changes: 16 additions & 0 deletions tests/cli/test_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,5 +553,21 @@ def test_bulk_add_stdin(self):
assert f.local_filename == Path('BIN/FILE_0001_BIN.IMG-wolf.png')
assert f.url == 'https://host/FILE_0001_BIN.IMG-wolf/BIN/FILE_0001_BIN.IMG-wolf.png'

def test_list_page(self):
    """
    Exercise ``ocrd workspace list-page`` against the list-page-workspace
    fixture: all three output formats, numeric (-R) and ID-based (-r) ranges,
    and chunking via -D / -C.
    """
    from json import loads

    def run_list_page(cli_args):
        # Only stdout matters here; drop the trailing newline(s) added by print
        result = self.invoke_cli(workspace_cli, ['list-page', *cli_args])
        stdout = result[1]
        return stdout.rstrip('\n')

    # The fixture contains pages 1..29, with 7 and 21 missing
    all_pages = ['PHYS_%04d' % num for num in range(1, 30) if num not in (7, 21)]
    pages_6_to_9 = 'PHYS_0006,PHYS_0008,PHYS_0009'
    last_of_three = 'PHYS_0008,PHYS_0009,PHYS_0010'
    first_ten = ['-r', 'PHYS_0001..PHYS_0010']

    workspace_dir = Path(__file__).parent.parent / 'data/list-page-workspace'
    with pushd_popd(workspace_dir):
        # output formats, unrestricted
        assert run_list_page([]) == '\n'.join(all_pages)
        assert run_list_page(['-f', 'comma-separated']) == ','.join(all_pages)
        expected_json = '[[' + ', '.join('"%s"' % page for page in all_pages) + ']]'
        assert run_list_page(['-f', 'json']) == expected_json
        # numeric ranges
        assert run_list_page(['-f', 'comma-separated', '-R', '5..5']) == 'PHYS_0005'
        assert run_list_page(['-f', 'comma-separated', '-R', '6..8']) == pages_6_to_9
        # page ID range
        assert run_list_page(['-f', 'comma-separated', '-r', 'PHYS_0006..PHYS_0009']) == pages_6_to_9
        # chunking: one line per chunk, or a single selected chunk
        three_chunks = '\n'.join(['PHYS_0001,PHYS_0002,PHYS_0003', 'PHYS_0004,PHYS_0005,PHYS_0006', last_of_three])
        assert run_list_page(['-f', 'comma-separated', *first_ten, '-D', '3']) == three_chunks
        assert run_list_page(['-f', 'comma-separated', *first_ten, '-D', '3', '-C', '2']) == last_of_three
        json_chunk = loads(run_list_page(['-f', 'json', *first_ten, '-D', '3', '-C', '2']))
        assert json_chunk == [['PHYS_0008', 'PHYS_0009', 'PHYS_0010']]

if __name__ == '__main__':
main(__file__)
188 changes: 188 additions & 0 deletions tests/data/list-page-workspace/mets.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
<?xml version="1.0" encoding="UTF-8"?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-0.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/mets.xsd http://www.loc.gov/mix/v10 http://www.loc.gov/standards/mix/mix10/mix10.xsd">
<mets:metsHdr CREATEDATE="2023-11-20T19:35:02.939335">
<mets:agent TYPE="OTHER" OTHERTYPE="SOFTWARE" ROLE="CREATOR">
<mets:name>ocrd/core v2.58.1</mets:name>
</mets:agent>
</mets:metsHdr>
<mets:dmdSec ID="DMDLOG_0001">
<mets:mdWrap MDTYPE="MODS">
<mets:xmlData>
<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">
</mods:mods>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:amdSec ID="AMD">
</mets:amdSec>
<mets:fileSec>
<mets:fileGrp USE="FOO">
<mets:file ID="FOO_1" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_2" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_3" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_4" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_5" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_6" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_8" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_9" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_10" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_11" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_12" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_13" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_14" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_15" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_16" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_17" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_18" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_19" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_20" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_22" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_23" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_24" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_25" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_26" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_27" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_28" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
<mets:file ID="FOO_29" MIMETYPE="foo/bar">
<mets:FLocat xlink:href="mets.xml" LOCTYPE="OTHER" OTHERLOCTYPE="FILE"/>
</mets:file>
</mets:fileGrp>
</mets:fileSec>
<mets:structMap TYPE="PHYSICAL">
<mets:div TYPE="physSequence">
<mets:div TYPE="page" ID="PHYS_0001">
<mets:fptr FILEID="FOO_1"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0002">
<mets:fptr FILEID="FOO_2"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0003">
<mets:fptr FILEID="FOO_3"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0004">
<mets:fptr FILEID="FOO_4"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0005">
<mets:fptr FILEID="FOO_5"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0006">
<mets:fptr FILEID="FOO_6"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0008">
<mets:fptr FILEID="FOO_8"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0009">
<mets:fptr FILEID="FOO_9"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0010">
<mets:fptr FILEID="FOO_10"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0011">
<mets:fptr FILEID="FOO_11"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0012">
<mets:fptr FILEID="FOO_12"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0013">
<mets:fptr FILEID="FOO_13"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0014">
<mets:fptr FILEID="FOO_14"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0015">
<mets:fptr FILEID="FOO_15"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0016">
<mets:fptr FILEID="FOO_16"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0017">
<mets:fptr FILEID="FOO_17"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0018">
<mets:fptr FILEID="FOO_18"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0019">
<mets:fptr FILEID="FOO_19"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0020">
<mets:fptr FILEID="FOO_20"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0022">
<mets:fptr FILEID="FOO_22"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0023">
<mets:fptr FILEID="FOO_23"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0024">
<mets:fptr FILEID="FOO_24"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0025">
<mets:fptr FILEID="FOO_25"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0026">
<mets:fptr FILEID="FOO_26"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0027">
<mets:fptr FILEID="FOO_27"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0028">
<mets:fptr FILEID="FOO_28"/>
</mets:div>
<mets:div TYPE="page" ID="PHYS_0029">
<mets:fptr FILEID="FOO_29"/>
</mets:div>
</mets:div>
</mets:structMap>
</mets:mets>
Loading