Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion repo/spec
94 changes: 51 additions & 43 deletions src/ocrd/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,9 +554,9 @@
self._base_logger.debug("stopped executor %s after %d tasks", str(executor), len(tasks) if tasks else -1)
if max_workers > 1:
# can cause deadlock:
#log_listener.stop()

Check failure on line 557 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 557 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 557 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 557 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 557 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 557 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E265

block comment should start with '# '

Check failure on line 557 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E265

block comment should start with '# '

Check failure on line 557 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E265

block comment should start with '# '

Check failure on line 557 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E265

block comment should start with '# '

Check failure on line 557 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E265

block comment should start with '# '
# not much better:
#log_listener.enqueue_sentinel()

Check failure on line 559 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 559 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 559 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 559 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 559 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 559 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E265

block comment should start with '# '

Check failure on line 559 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E265

block comment should start with '# '

Check failure on line 559 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E265

block comment should start with '# '

Check failure on line 559 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E265

block comment should start with '# '

Check failure on line 559 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E265

block comment should start with '# '
pass

except NotImplementedError:
Expand Down Expand Up @@ -639,7 +639,7 @@
self._base_logger.error(repr(e))
self._base_logger.warning(f"failed downloading file {input_file} for page {page_id}")
# process page
#executor.submit(self.process_page_file, *input_files)

Check failure on line 642 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 642 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 642 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 642 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 642 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 642 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E265

block comment should start with '# '

Check failure on line 642 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E265

block comment should start with '# '

Check failure on line 642 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E265

block comment should start with '# '

Check failure on line 642 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E265

block comment should start with '# '

Check failure on line 642 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E265

block comment should start with '# '
return executor.submit(_page_worker, max_seconds, *input_files), page_id, input_files

def process_workspace_handle_tasks(self, tasks: Dict[TFuture, Tuple[str, List[Optional[OcrdFileType]]]]) -> Tuple[
Expand Down Expand Up @@ -824,51 +824,59 @@
if not any(input_pcgts):
self._base_logger.warning(f'skipping page {page_id}')
return
output_file_id = make_file_id(input_files[input_pos], self.output_file_grp)
if input_files[input_pos].fileGrp == self.output_file_grp:
# input=output fileGrp: re-use ID exactly
output_file_id = input_files[input_pos].ID
output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
# short-cut avoiding useless computation:
raise FileExistsError(
f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set"
)
result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
for image_result in result.images:
image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
if isinstance(image_result.alternative_image, PageType):
# special case: not an alternative image, but replacing the original image
# (this is needed by certain processors when the original's coordinate system
# cannot or must not be kept)
image_result.alternative_image.set_imageFilename(image_file_path)
image_result.alternative_image.set_imageWidth(image_result.pil.width)
image_result.alternative_image.set_imageHeight(image_result.pil.height)
elif isinstance(image_result.alternative_image, AlternativeImageType):
image_result.alternative_image.set_filename(image_file_path)
elif image_result.alternative_image is None:
pass # do not reference in PAGE result
else:
raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
f"{type(image_result.alternative_image)}")
self.workspace.save_image_file(
image_result.pil,
image_file_id,
self.output_file_grp,
output_file_grps = self.output_file_grp.split(',')
output_file_ids = [make_file_id(input_files[input_pos], output_file_grp)
if input_files[input_pos].fileGrp != output_file_grp else
# input=output fileGrp: re-use ID exactly
input_files[input_pos].ID
for output_file_grp in output_file_grps]
if config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
for output_file_id in output_file_ids:
if output_file := next(self.workspace.mets.find_files(ID=output_file_id), None):
# short-cut avoiding useless computation:
raise FileExistsError(
f"A file with ID=={output_file_id} already exists {output_file}"
" and OCRD_EXISTING_OUTPUT != OVERWRITE"
)
results = self.process_page_pcgts(*input_pcgts, page_id=page_id)
if len(results) > len(output_file_grps):
self._base_logger.error(f"processor returned {len(results) - len(output_file_grps)} "
f"more results than specified output fileGrps for page {page_id}")
for result, output_file_id, output_file_grp in zip(results, output_file_ids, output_file_grps):
for image_result in result.images:
image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
image_file_path = join(output_file_grp, f'{image_file_id}.png')
if isinstance(image_result.alternative_image, PageType):
# special case: not an alternative image, but replacing the original image
# (this is needed by certain processors when the original's coordinate system
# cannot or must not be kept, e.g. dewarping)
image_result.alternative_image.set_imageFilename(image_file_path)
image_result.alternative_image.set_imageWidth(image_result.pil.width)
image_result.alternative_image.set_imageHeight(image_result.pil.height)
elif isinstance(image_result.alternative_image, AlternativeImageType):
image_result.alternative_image.set_filename(image_file_path)
elif image_result.alternative_image is None:
pass # do not reference in PAGE result
else:
raise ValueError(f"process_page_pcgts returned an OcrdPageResultImage of unknown type "
f"{type(image_result.alternative_image)}")
self.workspace.save_image_file(
image_result.pil,
image_file_id,
output_file_grp,
page_id=page_id,
file_path=image_file_path,
)
result.pcgts.set_pcGtsId(output_file_id)
self.add_metadata(result.pcgts)
self.workspace.add_file(
file_id=output_file_id,
file_grp=output_file_grp,
page_id=page_id,
file_path=image_file_path,
local_filename=os.path.join(output_file_grp, output_file_id + '.xml'),
mimetype=MIMETYPE_PAGE,
content=to_xml(result.pcgts),
)
result.pcgts.set_pcGtsId(output_file_id)
self.add_metadata(result.pcgts)
self.workspace.add_file(
file_id=output_file_id,
file_grp=self.output_file_grp,
page_id=page_id,
local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
mimetype=MIMETYPE_PAGE,
content=to_xml(result.pcgts),
)

def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
"""
Expand Down Expand Up @@ -914,7 +922,7 @@
value=self.version),
LabelType(type_='ocrd/core',
value=OCRD_VERSION)])
])

Check failure on line 925 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E124

closing bracket does not match visual indentation

Check failure on line 925 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E124

closing bracket does not match visual indentation

Check failure on line 925 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E124

closing bracket does not match visual indentation

Check failure on line 925 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E124

closing bracket does not match visual indentation

Check failure on line 925 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E124

closing bracket does not match visual indentation

Check failure on line 925 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E124

closing bracket does not match visual indentation

Check failure on line 925 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E124

closing bracket does not match visual indentation

Check failure on line 925 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E124

closing bracket does not match visual indentation

Check failure on line 925 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E124

closing bracket does not match visual indentation

Check failure on line 925 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E124

closing bracket does not match visual indentation
metadata_obj.add_MetadataItem(metadata_item)

def resolve_resource(self, val):
Expand Down Expand Up @@ -1183,7 +1191,7 @@
if hasattr(file, 'pageId')), "")
pool = ThreadPool(processes=1)
try:
#_page_worker_processor.process_page_file(*input_files)

Check failure on line 1194 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 1194 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 1194 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.8, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 1194 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 1194 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

E265

block comment should start with '# '

Check failure on line 1194 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.11, macos-latest)

E265

block comment should start with '# '

Check failure on line 1194 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.12, macos-latest)

E265

block comment should start with '# '

Check failure on line 1194 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

E265

block comment should start with '# '

Check failure on line 1194 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

E265

block comment should start with '# '

Check failure on line 1194 in src/ocrd/processor/base.py

View workflow job for this annotation

GitHub Actions / build (3.8, macos-latest)

E265

block comment should start with '# '
async_result = pool.apply_async(_page_worker_processor.process_page_file, input_files)
async_result.get(timeout or None)
_page_worker_processor.logger.debug("page worker completed for page %s", page_id)
Expand Down
74 changes: 74 additions & 0 deletions src/ocrd/processor/ocrd_page_result.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from dataclasses import dataclass, field
import copy
from typing import List, Union, Optional
from ocrd_models.ocrd_page import OcrdPage
from PIL.Image import Image
Expand All @@ -8,12 +9,85 @@

@dataclass
class OcrdPageResultImage:
    """
    One derived image that a processor wants to have written to the
    :py:class:`ocrd.Workspace`, plus the PAGE element (if any) whose
    file reference has to be filled in once the final path is known.
    """
    pil: Image
    """
    image data to be saved
    """
    file_id_suffix: str
    """
    a suffix to append to the file name when saving
    (something like ``.IMG`` according to OCR-D
    conventions for PAGE-XML)
    """
    alternative_image: Union[AlternativeImageType, PageType, None]
    """
    the ``AlternativeImage`` instance that references this image;
    to be amended with the actual (final) ``@filename`` when saving

    alternatively, can be a ``Page`` instance: in that case,
    amend its ``@imageFilename`` (i.e. replace the original image
    of the PAGE-XML)
    """


@dataclass
class OcrdPageResult:
    """
    Outcome of processing a single page for a single output: the
    resulting :py:class:`ocrd_models.ocrd_page.OcrdPage` together with
    the :py:class:`OcrdPageResultImage` entries for all image files that
    are referenced via ``AlternativeImage`` and must be persisted into
    the :py:class:`ocrd.Workspace` along with the PAGE-XML itself.

    Note: at the end of this module this dataclass is re-exported as
    ``SingleOcrdPageResult``, and the name ``OcrdPageResult`` is rebound
    to the list-like wrapper class.
    """
    pcgts: OcrdPage
    images: List[OcrdPageResultImage] = field(default_factory=list)

class OcrdPageResultVariadicListWrapper:
    """
    Proxy object for :py:class:`ocrd.SingleOcrdPageResult` allowing
    list semantics (i.e. multi-valued return from
    :py:func:`ocrd.Processor.process_page_pcgts`) without changing
    the API introduced in version 3.0.

    Everything but list access will yield the old (singular valued)
    semantics: any attribute that is not found on the wrapper itself
    is looked up on the first result.
    """
    def __init__(
            self,
            pcgts: OcrdPage,
            *args):
        """
        Wrap ``pcgts`` and every further positional argument in a
        ``SingleOcrdPageResult`` of its own.

        Args:
            pcgts: PAGE result for the first output.
            *args: PAGE results for any additional outputs, in order.
        """
        self._results = [SingleOcrdPageResult(pcgts)] + [
            SingleOcrdPageResult(arg) for arg in args]

    @classmethod
    def _from_results(cls, results):
        """Build a wrapper around already existing single results."""
        # Deliberately bypass __init__: it expects PAGE objects and would
        # wrap each existing result in yet another SingleOcrdPageResult.
        wrapper = cls.__new__(cls)
        wrapper._results = list(results)
        return wrapper

    def __getitem__(self, key):
        return self._results[key]

    def __contains__(self, key):
        return key in self._results

    def __len__(self):
        return len(self._results)

    def __iter__(self):
        return iter(self._results)

    def __repr__(self):
        return repr(self._results)

    # allow copy() without infinite recursion
    def __copy__(self):
        """
        Return a new wrapper holding a shallow copy of each result, so
        the copies still share their ``pcgts`` and ``images`` objects
        with the originals.
        """
        return self._from_results(
            copy.copy(result) for result in self._results)

    # allow deepcopy() without infinite recursion
    def __deepcopy__(self, memo):
        """
        Return a new wrapper holding deep copies of all results.

        ``memo`` is handed on so that objects shared between results
        (or with an enclosing deepcopy) are copied only once.
        """
        return self._from_results(copy.deepcopy(self._results, memo))

    # delegate to all members of first result
    def __getattr__(self, name):
        """
        Look up ``name`` on the first result.

        Raises:
            AttributeError: if ``name`` is ``_results`` itself, i.e. the
                wrapper has not been initialised yet.
        """
        if name == '_results':
            # __getattr__ only runs for names missing from the instance;
            # evaluating self._results here would re-enter this method
            # without end.
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}")
        return getattr(self._results[0], name)

# Keep the public name backward compatible: ``OcrdPageResult`` now refers
# to the list-like wrapper, while the dataclass describing one result
# stays reachable under the name ``SingleOcrdPageResult``.
SingleOcrdPageResult = OcrdPageResult
OcrdPageResult = OcrdPageResultVariadicListWrapper
20 changes: 20 additions & 0 deletions src/ocrd_models/ocrd_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
API to PAGE-XML, generated with generateDS from XML schema.
"""
from io import StringIO
import copy
from typing import Dict, Union, Any
from lxml import etree as ET
from elementpath import XPath2Parser, XPathContext
Expand Down Expand Up @@ -212,6 +213,25 @@ def __init__(
self.xpath_context = XPathContext(self.etree)
self.xpath = lambda expression: self.xpath_parser.parse(expression).get_results(self.xpath_context)

# allow copy() without infinite recursion
def __copy__(self):
    """
    Return a new ``OcrdPage`` built from shallow copies of ``_pcgts``,
    ``etree``, ``mapping`` and ``revmap`` (in that order).

    Implemented explicitly so that :py:func:`copy.copy` does not end up
    in infinite recursion.
    """
    members = (self._pcgts, self.etree, self.mapping, self.revmap)
    return OcrdPage(*[copy.copy(member) for member in members])

# allow deepcopy() without infinite recursion
def __deepcopy__(self, memo):
    """
    Return a new ``OcrdPage`` built from deep copies of ``_pcgts``,
    ``etree``, ``mapping`` and ``revmap`` (in that order).

    All four copies use the same ``memo``, so an object reachable from
    more than one of them is duplicated only once. Implemented
    explicitly so that :py:func:`copy.deepcopy` does not end up in
    infinite recursion.
    """
    members = (self._pcgts, self.etree, self.mapping, self.revmap)
    return OcrdPage(*[copy.deepcopy(member, memo) for member in members])

# delegate to all members of ._pcgts
def __getattr__(self, name):
    """
    Look up ``name`` on the wrapped ``_pcgts`` object.

    Python calls this only for attributes that normal lookup did not
    find on the instance or its class.

    Raises:
        AttributeError: if ``name`` is ``_pcgts`` itself, i.e. the
            instance has no ``_pcgts`` attribute (yet), or if the
            wrapped object does not have ``name`` either.
    """
    if name == '_pcgts':
        # Evaluating self._pcgts below would re-enter this method without
        # end when the attribute is missing (e.g. on an instance created
        # without running __init__), so fail with a regular error instead.
        raise AttributeError(
            f"{type(self).__name__!r} object has no attribute {name!r}")
    return getattr(self._pcgts, name)

Expand Down
Loading
Loading