From f299c2a6932b2c91bd6924de7468d13425f30612 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 16 Jan 2023 16:50:43 +0100 Subject: [PATCH 01/13] Processor builder outside the run_processor + cached version --- ocrd/ocrd/processor/helpers.py | 82 ++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index e07333eaa2..3ebfe4fb5d 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -3,6 +3,8 @@ """ from os import environ from time import perf_counter, process_time +from functools import lru_cache, wraps +from frozendict import frozendict import json import inspect from subprocess import run, PIPE @@ -42,6 +44,7 @@ def run_processor( parameter=None, parameter_override=None, working_dir=None, + cached_processor=False ): # pylint: disable=too-many-locals """ Instantiate a Pythonic processor, open a workspace, run the processor and save the workspace. @@ -73,14 +76,17 @@ def run_processor( ) log = getLogger('ocrd.processor.helpers.run_processor') log.debug("Running processor %s", processorClass) - processor = processorClass( - workspace, - ocrd_tool=ocrd_tool, + + processor = get_processor( + processor_class=processorClass, + parameter=parameter, + workspace=workspace, page_id=page_id, input_file_grp=input_file_grp, output_file_grp=output_file_grp, - parameter=parameter + cached_processor=cached_processor ) + ocrd_tool = processor.ocrd_tool name = '%s v%s' % (ocrd_tool['executable'], processor.version) otherrole = ocrd_tool['steps'][0] @@ -263,3 +269,71 @@ def wrap(s): ocrd_tool.get('input_file_grp', 'NONE'), ocrd_tool.get('output_file_grp', 'NONE') ) + + +# Taken from https://github.com/OCR-D/core/pull/884 +def freeze_args(func): + """ + Transform mutable dictionary into immutable. Useful to be compatible with cache. 
+ Code taken from `this post `_ + """ + @wraps(func) + def wrapped(*args, **kwargs): + args = tuple([frozendict(arg) if isinstance(arg, dict) else arg for arg in args]) + kwargs = {k: frozendict(v) if isinstance(v, dict) else v for k, v in kwargs.items()} + return func(*args, **kwargs) + return wrapped + + +# Taken from https://github.com/OCR-D/core/pull/884 +# TODO: Decide how much maxsize is optimal as a default +@freeze_args +@lru_cache(maxsize=32) +def get_cached_processor(parameter: dict, processor_class=None): + """ + Call this function to get back an instance of a processor. + The results are cached based on the parameters. + Args: + parameter (dict): a dictionary of parameters. + processor_class: the concrete :py:class:`~ocrd.Processor` class. + Returns: + When the concrete class of the processor is unknown, `None` is returned. + Otherwise, an instance of the :py:class:`~ocrd.Processor` is returned. + """ + if processor_class: + dict_params = dict(parameter) if parameter else None + return processor_class(workspace=None, parameter=dict_params) + return None + + +def get_processor( + processor_class, + parameter: dict, + workspace=None, + ocrd_tool=None, + page_id=None, + input_file_grp=None, + output_file_grp=None, + cached_processor: bool = False, +): + if processor_class: + if cached_processor: + cached_processor = get_cached_processor( + parameter=parameter, + processor_class=processor_class + ) + cached_processor.workspace = workspace + cached_processor.page_id = page_id + cached_processor.input_file_grp = input_file_grp + cached_processor.output_file_grp = output_file_grp + return cached_processor + else: + return processor_class( + workspace, + ocrd_tool=ocrd_tool, + page_id=page_id, + input_file_grp=input_file_grp, + output_file_grp=output_file_grp, + parameter=parameter + ) + raise ValueError("Processor class is not known") From 79ff39da41931e49222f142c9cb88d5489a01b0e Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 16 Jan 2023 16:58:51 +0100 
Subject: [PATCH 02/13] Add frozendict to requirements --- ocrd/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd/requirements.txt b/ocrd/requirements.txt index afa64a4c30..e3c155c733 100644 --- a/ocrd/requirements.txt +++ b/ocrd/requirements.txt @@ -11,3 +11,4 @@ Deprecated == 1.2.0 memory-profiler >= 0.58.0 sparklines >= 0.4.2 python-magic +frozendict>=2.3.4 From eb8dee53958f8b8f191117e27a8a29a31b875f6d Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 16 Jan 2023 17:25:27 +0100 Subject: [PATCH 03/13] include ocrd_tool in params --- ocrd/ocrd/processor/helpers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 3ebfe4fb5d..644c6de6b6 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -81,6 +81,7 @@ def run_processor( processor_class=processorClass, parameter=parameter, workspace=workspace, + ocrd_tool=ocrd_tool, page_id=page_id, input_file_grp=input_file_grp, output_file_grp=output_file_grp, From 152566e47673f26e16388a645f71ab5e75fb4750 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 16 Jan 2023 17:51:53 +0100 Subject: [PATCH 04/13] Move freeze_args to ocrd.utils.introspect --- ocrd/ocrd/processor/helpers.py | 20 +++----------------- ocrd/requirements.txt | 1 - ocrd_utils/ocrd_utils/__init__.py | 4 +++- ocrd_utils/ocrd_utils/introspect.py | 17 +++++++++++++++++ ocrd_utils/requirements.txt | 1 + 5 files changed, 24 insertions(+), 19 deletions(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 644c6de6b6..15f8f95c1d 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -3,8 +3,7 @@ """ from os import environ from time import perf_counter, process_time -from functools import lru_cache, wraps -from frozendict import frozendict +from functools import lru_cache import json import inspect from subprocess import run, PIPE @@ -12,7 +11,8 @@ from sparklines import sparklines from 
click import wrap_text -from ocrd_utils import getLogger +from ocrd_utils import getLogger, freeze_args + __all__ = [ 'generate_processor_help', @@ -272,20 +272,6 @@ def wrap(s): ) -# Taken from https://github.com/OCR-D/core/pull/884 -def freeze_args(func): - """ - Transform mutable dictionary into immutable. Useful to be compatible with cache. - Code taken from `this post `_ - """ - @wraps(func) - def wrapped(*args, **kwargs): - args = tuple([frozendict(arg) if isinstance(arg, dict) else arg for arg in args]) - kwargs = {k: frozendict(v) if isinstance(v, dict) else v for k, v in kwargs.items()} - return func(*args, **kwargs) - return wrapped - - # Taken from https://github.com/OCR-D/core/pull/884 # TODO: Decide how much maxsize is optimal as a default @freeze_args diff --git a/ocrd/requirements.txt b/ocrd/requirements.txt index e3c155c733..afa64a4c30 100644 --- a/ocrd/requirements.txt +++ b/ocrd/requirements.txt @@ -11,4 +11,3 @@ Deprecated == 1.2.0 memory-profiler >= 0.58.0 sparklines >= 0.4.2 python-magic -frozendict>=2.3.4 diff --git a/ocrd_utils/ocrd_utils/__init__.py b/ocrd_utils/ocrd_utils/__init__.py index c0f31e1b00..1eb92cff9f 100644 --- a/ocrd_utils/ocrd_utils/__init__.py +++ b/ocrd_utils/ocrd_utils/__init__.py @@ -151,8 +151,10 @@ xywh_from_polygon) from .introspect import ( + freeze_args, set_json_key_value_overrides, - membername) + membername +) from .logging import ( disableLogging, diff --git a/ocrd_utils/ocrd_utils/introspect.py b/ocrd_utils/ocrd_utils/introspect.py index ab33320650..cfd3d32b52 100644 --- a/ocrd_utils/ocrd_utils/introspect.py +++ b/ocrd_utils/ocrd_utils/introspect.py @@ -2,6 +2,23 @@ Utility functions to simplify access to data structures. """ import json +from functools import wraps +from frozendict import frozendict + + +# Taken from https://github.com/OCR-D/core/pull/884 +def freeze_args(func): + """ + Transform mutable dictionary into immutable. Useful to be compatible with cache. 
+ Code taken from `this post `_ + """ + @wraps(func) + def wrapped(*args, **kwargs): + args = tuple([frozendict(arg) if isinstance(arg, dict) else arg for arg in args]) + kwargs = {k: frozendict(v) if isinstance(v, dict) else v for k, v in kwargs.items()} + return func(*args, **kwargs) + return wrapped + def membername(class_, val): """Convert a member variable/constant into a member name string.""" diff --git a/ocrd_utils/requirements.txt b/ocrd_utils/requirements.txt index de4e7adee3..b1771cb20b 100644 --- a/ocrd_utils/requirements.txt +++ b/ocrd_utils/requirements.txt @@ -5,3 +5,4 @@ numpy atomicwrites >= 1.3.0 importlib_metadata;python_version<'3.8' importlib_resources;python_version<'3.8' +frozendict>=2.3.4 From a0fa60c097fba21028e3417c19ddb562afe37b58 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 16 Jan 2023 18:52:13 +0100 Subject: [PATCH 05/13] Add typing to get_processor and get_cached_processor --- ocrd/ocrd/processor/helpers.py | 39 +++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 15f8f95c1d..c0d6d75f55 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -7,11 +7,15 @@ import json import inspect from subprocess import run, PIPE +from typing import List, Type + from memory_profiler import memory_usage from sparklines import sparklines from click import wrap_text -from ocrd_utils import getLogger, freeze_args +from ocrd import Workspace +from ocrd.processor import Processor +from ocrd_utils import freeze_args, getLogger __all__ = [ @@ -20,6 +24,7 @@ 'run_processor' ] + def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=None): if workspace is None: if resolver is None: @@ -78,6 +83,7 @@ def run_processor( log.debug("Running processor %s", processorClass) processor = get_processor( + # TODO: Warning: processorClass of type Object gets auto casted to Type[Processor] 
processor_class=processorClass, parameter=parameter, workspace=workspace, @@ -276,7 +282,7 @@ def wrap(s): # TODO: Decide how much maxsize is optimal as a default @freeze_args @lru_cache(maxsize=32) -def get_cached_processor(parameter: dict, processor_class=None): +def get_cached_processor(parameter: dict, processor_class: Type[Processor]): """ Call this function to get back an instance of a processor. The results are cached based on the parameters. @@ -294,13 +300,13 @@ def get_cached_processor(parameter: dict, processor_class=None): def get_processor( - processor_class, + processor_class: Type[Processor], parameter: dict, - workspace=None, - ocrd_tool=None, - page_id=None, - input_file_grp=None, - output_file_grp=None, + workspace: Workspace = None, + ocrd_tool: dict = None, + page_id: str = None, + input_file_grp: List[str] = None, + output_file_grp: List[str] = None, cached_processor: bool = False, ): if processor_class: @@ -314,13 +320,12 @@ def get_processor( cached_processor.input_file_grp = input_file_grp cached_processor.output_file_grp = output_file_grp return cached_processor - else: - return processor_class( - workspace, - ocrd_tool=ocrd_tool, - page_id=page_id, - input_file_grp=input_file_grp, - output_file_grp=output_file_grp, - parameter=parameter - ) + return processor_class( + workspace=workspace, + ocrd_tool=ocrd_tool, + page_id=page_id, + input_file_grp=input_file_grp, + output_file_grp=output_file_grp, + parameter=parameter + ) raise ValueError("Processor class is not known") From 3349328823e15c0bec2b597cc0fed67215defb2f Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Mon, 16 Jan 2023 19:36:40 +0100 Subject: [PATCH 06/13] Fix workspace import --- ocrd/ocrd/processor/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index c0d6d75f55..0bc7db7e58 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -13,7 +13,7 @@ from sparklines 
import sparklines from click import wrap_text -from ocrd import Workspace +from ocrd.workspace import Workspace from ocrd.processor import Processor from ocrd_utils import freeze_args, getLogger From f7f8278450eb7a893036d79fddca93c5637de824 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 17 Jan 2023 13:02:07 +0100 Subject: [PATCH 07/13] Remove processor typing Importing the Processor fails no matter from where it is imported. --- ocrd/ocrd/processor/helpers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 0bc7db7e58..d599796977 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -14,7 +14,6 @@ from click import wrap_text from ocrd.workspace import Workspace -from ocrd.processor import Processor from ocrd_utils import freeze_args, getLogger @@ -300,7 +299,7 @@ def get_cached_processor(parameter: dict, processor_class: Type[Processor]): def get_processor( - processor_class: Type[Processor], + processor_class, parameter: dict, workspace: Workspace = None, ocrd_tool: dict = None, From b8acdc888ed8f969f3bc262aa3cf901282c2a95e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 17 Jan 2023 13:06:12 +0100 Subject: [PATCH 08/13] import processor from base --- ocrd/ocrd/processor/helpers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index d599796977..9a74945ced 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -14,6 +14,7 @@ from click import wrap_text from ocrd.workspace import Workspace +from ocrd.processor.base import Processor from ocrd_utils import freeze_args, getLogger @@ -299,7 +300,7 @@ def get_cached_processor(parameter: dict, processor_class: Type[Processor]): def get_processor( - processor_class, + processor_class: Type[Processor], parameter: dict, workspace: Workspace = None, ocrd_tool: dict = None, From 
b4bdff356f8ce5e63383bc72b99d5640bba6a190 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 17 Jan 2023 13:10:31 +0100 Subject: [PATCH 09/13] remove processor typing --- ocrd/ocrd/processor/helpers.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 9a74945ced..8c46ec04c9 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -14,7 +14,6 @@ from click import wrap_text from ocrd.workspace import Workspace -from ocrd.processor.base import Processor from ocrd_utils import freeze_args, getLogger @@ -282,7 +281,7 @@ def wrap(s): # TODO: Decide how much maxsize is optimal as a default @freeze_args @lru_cache(maxsize=32) -def get_cached_processor(parameter: dict, processor_class: Type[Processor]): +def get_cached_processor(parameter: dict, processor_class): """ Call this function to get back an instance of a processor. The results are cached based on the parameters. @@ -300,7 +299,7 @@ def get_cached_processor(parameter: dict, processor_class: Type[Processor]): def get_processor( - processor_class: Type[Processor], + processor_class, parameter: dict, workspace: Workspace = None, ocrd_tool: dict = None, From 5d2a3cb76d1e5674b3d86045cc0c80f2889abc0b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 21 Jan 2023 18:04:51 +0100 Subject: [PATCH 10/13] Add warning regarding the cached_processor flag --- ocrd/ocrd/processor/helpers.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 8c46ec04c9..c2a7dafefe 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -48,7 +48,7 @@ def run_processor( parameter=None, parameter_override=None, working_dir=None, - cached_processor=False + cached_processor=False # TODO don't set this yet! 
): # pylint: disable=too-many-locals """ Instantiate a Pythonic processor, open a workspace, run the processor and save the workspace. @@ -65,6 +65,9 @@ def run_processor( - :py:attr:`output_file_grp` - :py:attr:`parameter` (after applying any :py:attr:`parameter_override` settings) + Warning: Avoid setting the `cached_processor` flag to True. It may have unexpected side effects. + This flag is used for an experimental feature we would like to adopt in future. + Run the processor on the workspace (creating output files in the filesystem). Finally, write back the workspace (updating the METS in the filesystem). @@ -82,7 +85,6 @@ def run_processor( log.debug("Running processor %s", processorClass) processor = get_processor( - # TODO: Warning: processorClass of type Object gets auto casted to Type[Processor] processor_class=processorClass, parameter=parameter, workspace=workspace, @@ -279,8 +281,9 @@ def wrap(s): # Taken from https://github.com/OCR-D/core/pull/884 # TODO: Decide how much maxsize is optimal as a default +# TODO: The max size should be configurable with, e.g., with an environment variable? @freeze_args -@lru_cache(maxsize=32) +@lru_cache(maxsize=32) def get_cached_processor(parameter: dict, processor_class): """ Call this function to get back an instance of a processor. From d5fccda5435eb5433c8a99fcd9ebaa3c984959f4 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 14 Feb 2023 10:50:14 +0100 Subject: [PATCH 11/13] kwarg cached_processor -> instance_caching --- ocrd/ocrd/processor/helpers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index c2a7dafefe..03f8d6beec 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -48,7 +48,7 @@ def run_processor( parameter=None, parameter_override=None, working_dir=None, - cached_processor=False # TODO don't set this yet! + instance_caching=False # TODO don't set this yet! 
): # pylint: disable=too-many-locals """ Instantiate a Pythonic processor, open a workspace, run the processor and save the workspace. @@ -92,7 +92,7 @@ def run_processor( page_id=page_id, input_file_grp=input_file_grp, output_file_grp=output_file_grp, - cached_processor=cached_processor + instance_caching=instance_caching ) ocrd_tool = processor.ocrd_tool @@ -309,10 +309,10 @@ def get_processor( page_id: str = None, input_file_grp: List[str] = None, output_file_grp: List[str] = None, - cached_processor: bool = False, + instance_caching: bool = False, ): if processor_class: - if cached_processor: + if instance_caching: cached_processor = get_cached_processor( parameter=parameter, processor_class=processor_class From 76a6b029ecb8f394867e3349a02ff77e7fb7e52b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 14 Feb 2023 10:52:01 +0100 Subject: [PATCH 12/13] Env for max processor cache, or 128 by default --- ocrd/ocrd/processor/helpers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 03f8d6beec..78db34e0cf 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -280,10 +280,8 @@ def wrap(s): # Taken from https://github.com/OCR-D/core/pull/884 -# TODO: Decide how much maxsize is optimal as a default -# TODO: The max size should be configurable with, e.g., with an environment variable? @freeze_args -@lru_cache(maxsize=32) +@lru_cache(maxsize=int(environ.get('OCRD_MAX_PROCESSOR_CACHE', 128))) def get_cached_processor(parameter: dict, processor_class): """ Call this function to get back an instance of a processor. 
From 80046204323f12b432090e1b3174ad7070c87504 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 14 Feb 2023 22:43:23 +0100 Subject: [PATCH 13/13] replace missed: cached_processor -> instance_caching Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd/ocrd/processor/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 78db34e0cf..3450b51a0b 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -65,7 +65,7 @@ def run_processor( - :py:attr:`output_file_grp` - :py:attr:`parameter` (after applying any :py:attr:`parameter_override` settings) - Warning: Avoid setting the `cached_processor` flag to True. It may have unexpected side effects. + Warning: Avoid setting the `instance_caching` flag to True. It may have unexpected side effects. This flag is used for an experimental feature we would like to adopt in future. Run the processor on the workspace (creating output files in the filesystem).