From c81d31274a9fe01e5209d3350ef68db98a164cea Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 11 Oct 2023 16:28:10 +0200 Subject: [PATCH 01/11] refactor logging constants --- ocrd_network/ocrd_network/deployer.py | 27 +++++------ ocrd_network/ocrd_network/logging.py | 48 +++++++++++++++++++ .../ocrd_network/processing_server.py | 8 ++++ .../ocrd_network/processing_worker.py | 33 ++++++------- ocrd_network/ocrd_network/processor_server.py | 5 +- ocrd_network/ocrd_network/server_cache.py | 11 ++++- 6 files changed, 93 insertions(+), 39 deletions(-) create mode 100644 ocrd_network/ocrd_network/logging.py diff --git a/ocrd_network/ocrd_network/deployer.py b/ocrd_network/ocrd_network/deployer.py index 403bb94dcd..ebcd8f6fe5 100644 --- a/ocrd_network/ocrd_network/deployer.py +++ b/ocrd_network/ocrd_network/deployer.py @@ -9,7 +9,6 @@ from __future__ import annotations from typing import Dict, List, Union from re import search as re_search -from os import getpid from pathlib import Path import subprocess from time import sleep @@ -22,7 +21,7 @@ verify_mongodb_available, verify_rabbitmq_available, ) - +from .logging import get_mets_server_logging_file_path from .runtime_data import ( DataHost, DataMongoDB, @@ -468,18 +467,15 @@ def start_native_processor( self.log.info(f'Starting native processing worker: {processor_name}') channel = ssh_client.invoke_shell() stdin, stdout = channel.makefile('wb'), channel.makefile('rb') - cmd = f'{processor_name} worker --database {database_url} --queue {queue_url}' + cmd = f'{processor_name} worker --database {database_url} --queue {queue_url} &' # the only way (I could find) to make it work to start a process in the background and # return early is this construction. The pid of the last started background process is # printed with `echo $!` but it is printed inbetween other output. Because of that I added # `xyz` before and after the code to easily be able to filter out the pid via regex when # returning from the function - # TODO: Check here again - # log_path = f'/tmp/deployed_{processor_name}.log' - # stdin.write(f"echo starting processing worker with '{cmd}' >> '{log_path}'\n") - # stdin.write(f'{cmd} >> {log_path} 2>&1 &\n') - stdin.write(f'{cmd} &\n') + self.log.debug(f'About to execute command: {cmd}') + stdin.write(f'{cmd}\n') stdin.write('echo xyz$!xyz \n exit \n') output = stdout.read().decode('utf-8') stdout.close() @@ -514,9 +510,9 @@ def start_native_processor_server( self.log.info(f"Starting native processor server: {processor_name} on {agent_address}") channel = ssh_client.invoke_shell() stdin, stdout = channel.makefile('wb'), channel.makefile('rb') - cmd = f'{processor_name} server --address {agent_address} --database {database_url}' - stdin.write(f"echo starting processor server with '{cmd}'\n") - stdin.write(f'{cmd} &\n') + cmd = f'{processor_name} server --address {agent_address} --database {database_url} &' + self.log.debug(f'About to execute command: {cmd}') + stdin.write(f'{cmd}\n') stdin.write('echo xyz$!xyz \n exit \n') output = stdout.read().decode('utf-8') stdout.close() @@ -525,9 +521,8 @@ def start_native_processor_server( # TODO: No support for TCP version yet def start_unix_mets_server(self, mets_path: str) -> str: - socket_file = f'{safe_filename(mets_path)}.sock' - log_path = f'/tmp/{safe_filename(mets_path)}.log' - mets_server_url = f'/tmp/{socket_file}' + log_file = get_mets_server_logging_file_path(mets_path=mets_path) + mets_server_url = f'/tmp/{safe_filename(mets_path)}.sock' if is_mets_server_running(mets_server_url=mets_server_url): self.log.info(f"The mets server is already started: {mets_server_url}") @@ -539,8 +534,8 @@ def start_unix_mets_server(self, mets_path: str) -> str: args=['nohup', 'ocrd', 'workspace', '--mets-server-url', f'{mets_server_url}', '-d', f'{cwd}', 'server', 'start'], shell=False, - stdout=open(log_path, 'w'), - stderr=open(log_path, 'a'), + stdout=open(file=log_file, mode='w'), + stderr=open(file=log_file, mode='w'), cwd=cwd, universal_newlines=True ) diff --git a/ocrd_network/ocrd_network/logging.py b/ocrd_network/ocrd_network/logging.py new file mode 100644 index 0000000000..78bfba7bd0 --- /dev/null +++ b/ocrd_network/ocrd_network/logging.py @@ -0,0 +1,48 @@ +from os import makedirs +from os.path import join +from ocrd_utils import safe_filename + +OCRD_NETWORK_MODULES = [ + "mets_servers", + "processing_jobs", + "processing_servers", + "processing_workers", + "processor_servers" +] + + +def get_root_logging_dir(module_name: str) -> str: + if module_name not in OCRD_NETWORK_MODULES: + raise ValueError(f"Invalid module name: {module_name}, should be one of: {OCRD_NETWORK_MODULES}") + # TODO: Utilize env variable to set the root + module_log_dir = join("/tmp/ocrd_network_logs", module_name) + makedirs(name=module_log_dir, exist_ok=True) + return module_log_dir + + +def get_cache_locked_pages_logging_file_path() -> str: + return join(get_root_logging_dir("processing_servers"), f"cache_locked_pages.log") + + +def get_cache_processing_requests_logging_file_path() -> str: + return join(get_root_logging_dir("processing_servers"), f"cache_processing_requests.log") + + +def get_processing_job_logging_file_path(job_id: str) -> str: + return join(get_root_logging_dir("processing_jobs"), f"{job_id}.log") + + +def get_processing_server_logging_file_path(pid: int) -> str: + return join(get_root_logging_dir("processing_servers"), f"server.{pid}.log") + + +def get_processing_worker_logging_file_path(processor_name: str, pid: int) -> str: + return join(get_root_logging_dir("processing_workers"), f"worker.{pid}.{processor_name}.log") + + +def get_processor_server_logging_file_path(processor_name: str, pid: int) -> str: + return join(get_root_logging_dir("processor_servers"), f"server.{pid}.{processor_name}.log") + + +def get_mets_server_logging_file_path(mets_path: str) -> str: + return join(get_root_logging_dir("mets_servers"), f"{safe_filename(mets_path)}.log") diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index befbcb8e37..e719285504 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -1,6 +1,8 @@ import json +import logging import requests import httpx +from os import getpid from typing import Dict, List import uvicorn @@ -30,6 +32,7 @@ db_update_workspace ) from .deployer import Deployer +from .logging import get_processing_server_logging_file_path from .models import ( DBProcessorJob, DBWorkflowJob, @@ -82,6 +85,11 @@ def __init__(self, config_path: str, host: str, port: int) -> None: description='OCR-D Processing Server' ) self.log = getLogger('ocrd_network.processing_server') + log_file = get_processing_server_logging_file_path(pid=getpid()) + file_handler = logging.FileHandler(filename=log_file, mode='a') + file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + self.log.addHandler(file_handler) + self.log.info(f"Downloading ocrd all tool json") self.ocrd_all_tool_json = download_ocrd_all_tool_json( ocrd_all_url="https://ocr-d.de/js/ocrd-all-tool.json" diff --git a/ocrd_network/ocrd_network/processing_worker.py b/ocrd_network/ocrd_network/processing_worker.py index e41a2aca80..f7a640bdf5 100644 --- a/ocrd_network/ocrd_network/processing_worker.py +++ b/ocrd_network/ocrd_network/processing_worker.py @@ -10,21 +10,22 @@ from datetime import datetime import logging -from os import getpid, makedirs - +from os import getpid +from time import sleep import pika.spec import pika.adapters.blocking_connection from pika.exceptions import AMQPConnectionError -from ocrd_utils import getLogger - -from time import sleep - +from ocrd_utils import config, getLogger from .database import ( sync_initiate_database, sync_db_get_workspace, sync_db_update_processing_job, ) +from .logging import ( + get_processing_job_logging_file_path, + get_processing_worker_logging_file_path +) from .models import StateEnum from .process_helpers import invoke_processor from .rabbitmq_utils import ( @@ -39,19 +40,15 @@ verify_database_uri, verify_and_parse_mq_uri ) -from ocrd_utils import config class ProcessingWorker: - def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None, log_filename:str=None) -> None: + def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None) -> None: self.log = getLogger(f'ocrd_network.processing_worker') - if not log_filename: - log_filename = f'/tmp/ocrd_worker_{processor_name}.{getpid()}.log' - self.log_filename = log_filename - # TODO: Use that handler once the separate job logs is resolved - # file_handler = logging.FileHandler(log_filename, mode='a') - # file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) - # self.log.addHandler(file_handler) + log_file = get_processing_worker_logging_file_path(processor_name=processor_name, pid=getpid()) + file_handler = logging.FileHandler(filename=log_file, mode='a') + file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + self.log.addHandler(file_handler) try: verify_database_uri(mongodb_addr) @@ -214,9 +211,7 @@ def process_message(self, processing_message: OcrdProcessingMessage) -> None: start_time=start_time ) try: - # TODO: Refactor the root logging dir for jobs - # makedirs(name='/tmp/ocrd_processing_jobs_logs', exist_ok=True) - # log_filename = f'/tmp/ocrd_processing_jobs_logs/{job_id}.log' + job_log_file = get_processing_job_logging_file_path(job_id=job_id) invoke_processor( processor_class=self.processor_class, executable=self.processor_name, @@ -224,7 +219,7 @@ def process_message(self, processing_message: OcrdProcessingMessage) -> None: input_file_grps=input_file_grps, output_file_grps=output_file_grps, page_id=page_id, - log_filename=self.log_filename, + log_filename=job_log_file, parameters=processing_message.parameters, mets_server_url=mets_server_url ) diff --git a/ocrd_network/ocrd_network/processor_server.py b/ocrd_network/ocrd_network/processor_server.py index 46211aaea3..b8a9f84ce5 100644 --- a/ocrd_network/ocrd_network/processor_server.py +++ b/ocrd_network/ocrd_network/processor_server.py @@ -18,6 +18,7 @@ db_update_processing_job, initiate_database ) +from .logging import get_processor_server_logging_file_path from .models import ( PYJobInput, PYJobOutput, @@ -49,9 +50,9 @@ def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class= title=f'OCR-D Processor Server', description='OCR-D Processor Server' ) - logging_suffix = f'{processor_name}.{getpid()}' self.log = getLogger('ocrd_network.processor_server') - file_handler = logging.FileHandler(f'/tmp/ocrd_server_{logging_suffix}.log', mode='a') + log_file = get_processor_server_logging_file_path(processor_name=processor_name, pid=getpid()) + file_handler = logging.FileHandler(filename=log_file, mode='a') file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) self.log.addHandler(file_handler) diff --git a/ocrd_network/ocrd_network/server_cache.py b/ocrd_network/ocrd_network/server_cache.py index 9863dcaa06..b2158f77f6 100644 --- a/ocrd_network/ocrd_network/server_cache.py +++ b/ocrd_network/ocrd_network/server_cache.py @@ -1,8 +1,13 @@ from __future__ import annotations from typing import Dict, List from logging import DEBUG, getLogger, FileHandler +from os import makedirs from .database import db_get_processing_job, db_update_processing_job +from .logging import ( + get_cache_locked_pages_logging_file_path, + get_cache_processing_requests_logging_file_path +) from .models import PYJobInput, StateEnum __all__ = [ @@ -16,7 +21,8 @@ def __init__(self) -> None: self.log = getLogger("ocrd_network.server_cache.locked_pages") # TODO: remove this when refactoring the logging self.log.setLevel(DEBUG) - log_fh = FileHandler(f'/tmp/ocrd_processing_server_cache_locked_pages.log') + log_file = get_cache_locked_pages_logging_file_path() + log_fh = FileHandler(filename=log_file, mode='a') log_fh.setLevel(DEBUG) self.log.addHandler(log_fh) @@ -111,7 +117,8 @@ def __init__(self) -> None: self.log = getLogger("ocrd_network.server_cache.processing_requests") # TODO: remove this when refactoring the logging self.log.setLevel(DEBUG) - log_fh = FileHandler(f'/tmp/ocrd_processing_server_cache_processing_requests.log') + log_file = get_cache_processing_requests_logging_file_path() + log_fh = FileHandler(filename=log_file, mode='a') log_fh.setLevel(DEBUG) self.log.addHandler(log_fh) From dbf06b649e494dff268ceb3318e66d3f9b4731c6 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 11 Oct 2023 16:43:18 +0200 Subject: [PATCH 02/11] add log file to job models --- ocrd_network/ocrd_network/database.py | 2 ++ ocrd_network/ocrd_network/models/job.py | 5 ++++- ocrd_network/ocrd_network/processing_worker.py | 5 +++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/ocrd_network/ocrd_network/database.py b/ocrd_network/ocrd_network/database.py index 58dcb465e5..019eb071c6 100644 --- a/ocrd_network/ocrd_network/database.py +++ b/ocrd_network/ocrd_network/database.py @@ -166,6 +166,8 @@ async def db_update_processing_job(job_id: str, **kwargs) -> DBProcessorJob: job.path_to_mets = value elif key == 'exec_time': job.exec_time = value + elif key == 'log_file_path': + job.log_file_path = value else: raise ValueError(f'Field "{key}" is not updatable.') await job.save() diff --git a/ocrd_network/ocrd_network/models/job.py b/ocrd_network/ocrd_network/models/job.py index 8aa92ca5bd..779f6e066c 100644 --- a/ocrd_network/ocrd_network/models/job.py +++ b/ocrd_network/ocrd_network/models/job.py @@ -66,6 +66,7 @@ class PYJobOutput(BaseModel): input_file_grps: List[str] output_file_grps: Optional[List[str]] page_id: Optional[str] = None + log_file_path: Optional[str] class DBProcessorJob(Document): @@ -88,6 +89,7 @@ class DBProcessorJob(Document): start_time: Optional[datetime] end_time: Optional[datetime] exec_time: Optional[str] + log_file_path: Optional[str] class Settings: use_enum_values = True @@ -101,7 +103,8 @@ def to_job_output(self) -> PYJobOutput: workspace_id=self.workspace_id, input_file_grps=self.input_file_grps, output_file_grps=self.output_file_grps, - page_id=self.page_id + page_id=self.page_id, + log_file_path=self.log_file_path ) diff --git a/ocrd_network/ocrd_network/processing_worker.py b/ocrd_network/ocrd_network/processing_worker.py index f7a640bdf5..50cd279223 100644 --- a/ocrd_network/ocrd_network/processing_worker.py +++ b/ocrd_network/ocrd_network/processing_worker.py @@ -204,14 +204,15 @@ def process_message(self, processing_message: OcrdProcessingMessage) -> None: execution_failed = False self.log.debug(f'Invoking processor: {self.processor_name}') start_time = datetime.now() + job_log_file = get_processing_job_logging_file_path(job_id=job_id) sync_db_update_processing_job( job_id=job_id, state=StateEnum.running, path_to_mets=path_to_mets, - start_time=start_time + start_time=start_time, + log_file_path=job_log_file ) try: - job_log_file = get_processing_job_logging_file_path(job_id=job_id) invoke_processor( processor_class=self.processor_class, executable=self.processor_name, From 3fa13707747c51dabc17415ecf07d5e25a57bc35 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 11 Oct 2023 17:17:43 +0200 Subject: [PATCH 03/11] fix bug: file mode a --- ocrd_network/ocrd_network/deployer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_network/ocrd_network/deployer.py b/ocrd_network/ocrd_network/deployer.py index ebcd8f6fe5..81855cc25a 100644 --- a/ocrd_network/ocrd_network/deployer.py +++ b/ocrd_network/ocrd_network/deployer.py @@ -535,7 +535,7 @@ def start_unix_mets_server(self, mets_path: str) -> str: '-d', f'{cwd}', 'server', 'start'], shell=False, stdout=open(file=log_file, mode='w'), - stderr=open(file=log_file, mode='w'), + stderr=open(file=log_file, mode='a'), cwd=cwd, universal_newlines=True ) From 8b07f073e2aa78b6262a5993754c74499667b012 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 11 Oct 2023 20:03:04 +0200 Subject: [PATCH 04/11] improve: use tempdir --- ocrd_network/ocrd_network/deployer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_network/ocrd_network/deployer.py b/ocrd_network/ocrd_network/deployer.py index 81855cc25a..a5ca5d075b 100644 --- a/ocrd_network/ocrd_network/deployer.py +++ b/ocrd_network/ocrd_network/deployer.py @@ -12,6 +12,7 @@ from pathlib import Path import subprocess from time import sleep +from tempfile import gettempdir from ocrd_utils import getLogger, safe_filename @@ -522,7 +523,7 @@ def start_native_processor_server( # TODO: No support for TCP version yet def start_unix_mets_server(self, mets_path: str) -> str: log_file = get_mets_server_logging_file_path(mets_path=mets_path) - mets_server_url = f'/tmp/{safe_filename(mets_path)}.sock' + mets_server_url = f'{gettempdir()}/{safe_filename(mets_path)}.sock' if is_mets_server_running(mets_server_url=mets_server_url): self.log.info(f"The mets server is already started: {mets_server_url}") From a083808d547f62cd66519e0904ba4cc10e030502 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 11 Oct 2023 20:13:05 +0200 Subject: [PATCH 05/11] use pathlib instead of os.path --- ocrd_network/ocrd_network/logging.py | 38 ++++++++++++++-------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/ocrd_network/ocrd_network/logging.py b/ocrd_network/ocrd_network/logging.py index 78bfba7bd0..503c4d7efb 100644 --- a/ocrd_network/ocrd_network/logging.py +++ b/ocrd_network/ocrd_network/logging.py @@ -1,7 +1,7 @@ -from os import makedirs -from os.path import join +from pathlib import Path from ocrd_utils import safe_filename + OCRD_NETWORK_MODULES = [ "mets_servers", "processing_jobs", @@ -11,38 +11,38 @@ ] -def get_root_logging_dir(module_name: str) -> str: +def get_root_logging_dir(module_name: str) -> Path: if module_name not in OCRD_NETWORK_MODULES: raise ValueError(f"Invalid module name: {module_name}, should be one of: {OCRD_NETWORK_MODULES}") # TODO: Utilize env variable to set the root - module_log_dir = join("/tmp/ocrd_network_logs", module_name) - makedirs(name=module_log_dir, exist_ok=True) + module_log_dir = Path("/tmp", "ocrd_network_logs", module_name) + module_log_dir.mkdir(parents=True, exist_ok=True) return module_log_dir -def get_cache_locked_pages_logging_file_path() -> str: - return join(get_root_logging_dir("processing_servers"), f"cache_locked_pages.log") +def get_cache_locked_pages_logging_file_path() -> Path: + return get_root_logging_dir("processing_servers") / "cache_locked_pages.log" -def get_cache_processing_requests_logging_file_path() -> str: - return join(get_root_logging_dir("processing_servers"), f"cache_processing_requests.log") +def get_cache_processing_requests_logging_file_path() -> Path: + return get_root_logging_dir("processing_servers") / "cache_processing_requests.log" -def get_processing_job_logging_file_path(job_id: str) -> str: - return join(get_root_logging_dir("processing_jobs"), f"{job_id}.log") +def get_processing_job_logging_file_path(job_id: str) -> Path: + return get_root_logging_dir("processing_jobs") / f"{job_id}.log" -def get_processing_server_logging_file_path(pid: int) -> str: - return join(get_root_logging_dir("processing_servers"), f"server.{pid}.log") +def get_processing_server_logging_file_path(pid: int) -> Path: + return get_root_logging_dir("processing_servers") / f"server.{pid}.log" -def get_processing_worker_logging_file_path(processor_name: str, pid: int) -> str: - return join(get_root_logging_dir("processing_workers"), f"worker.{pid}.{processor_name}.log") +def get_processing_worker_logging_file_path(processor_name: str, pid: int) -> Path: + return get_root_logging_dir("processing_workers") / f"worker.{pid}.{processor_name}.log" -def get_processor_server_logging_file_path(processor_name: str, pid: int) -> str: - return join(get_root_logging_dir("processor_servers"), f"server.{pid}.{processor_name}.log") +def get_processor_server_logging_file_path(processor_name: str, pid: int) -> Path: + return get_root_logging_dir("processor_servers") / f"server.{pid}.{processor_name}.log" -def get_mets_server_logging_file_path(mets_path: str) -> str: - return join(get_root_logging_dir("mets_servers"), f"{safe_filename(mets_path)}.log") +def get_mets_server_logging_file_path(mets_path: str) -> Path: + return get_root_logging_dir("mets_servers") / f"{safe_filename(mets_path)}.log" From 6bfae6abbcb32e480a73897e81fd681c202be952 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Wed, 11 Oct 2023 20:21:15 +0200 Subject: [PATCH 06/11] use ocrd_utils LOG_FORMAT --- ocrd_network/ocrd_network/processing_server.py | 8 ++++---- ocrd_network/ocrd_network/processing_worker.py | 8 ++++---- ocrd_network/ocrd_network/processor_server.py | 9 +++++---- ocrd_network/ocrd_network/server_cache.py | 12 ++++-------- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index e719285504..a5cfe5e9c5 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -1,5 +1,5 @@ import json -import logging +from logging import FileHandler, Formatter import requests import httpx from os import getpid @@ -18,7 +18,7 @@ from pika.exceptions import ChannelClosedByBroker from ocrd.task_sequence import ProcessorTask -from ocrd_utils import initLogging, getLogger +from ocrd_utils import initLogging, getLogger, LOG_FORMAT from ocrd import Resolver, Workspace from pathlib import Path from .database import ( @@ -86,8 +86,8 @@ def __init__(self, config_path: str, host: str, port: int) -> None: ) self.log = getLogger('ocrd_network.processing_server') log_file = get_processing_server_logging_file_path(pid=getpid()) - file_handler = logging.FileHandler(filename=log_file, mode='a') - file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + file_handler = FileHandler(filename=log_file, mode='a') + file_handler.setFormatter(Formatter(LOG_FORMAT)) self.log.addHandler(file_handler) self.log.info(f"Downloading ocrd all tool json") diff --git a/ocrd_network/ocrd_network/processing_worker.py b/ocrd_network/ocrd_network/processing_worker.py index 50cd279223..ff4287c848 100644 --- a/ocrd_network/ocrd_network/processing_worker.py +++ b/ocrd_network/ocrd_network/processing_worker.py @@ -9,14 +9,14 @@ """ from datetime import datetime -import logging +from logging import FileHandler, Formatter from os import getpid from time import sleep import pika.spec import pika.adapters.blocking_connection from pika.exceptions import AMQPConnectionError -from ocrd_utils import config, getLogger +from ocrd_utils import config, getLogger, LOG_FORMAT from .database import ( sync_initiate_database, sync_db_get_workspace, @@ -46,8 +46,8 @@ class ProcessingWorker: def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None) -> None: self.log = getLogger(f'ocrd_network.processing_worker') log_file = get_processing_worker_logging_file_path(processor_name=processor_name, pid=getpid()) - file_handler = logging.FileHandler(filename=log_file, mode='a') - file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + file_handler = FileHandler(filename=log_file, mode='a') + file_handler.setFormatter(Formatter(LOG_FORMAT)) self.log.addHandler(file_handler) try: diff --git a/ocrd_network/ocrd_network/processor_server.py b/ocrd_network/ocrd_network/processor_server.py index b8a9f84ce5..24c8395fc9 100644 --- a/ocrd_network/ocrd_network/processor_server.py +++ b/ocrd_network/ocrd_network/processor_server.py @@ -1,5 +1,5 @@ from datetime import datetime -import logging +from logging import FileHandler, Formatter from os import getpid from subprocess import run, PIPE import uvicorn @@ -10,7 +10,8 @@ initLogging, get_ocrd_tool_json, getLogger, - parse_json_string_with_comments, + LOG_FORMAT, + parse_json_string_with_comments ) from .database import ( DBProcessorJob, @@ -52,8 +53,8 @@ def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class= ) self.log = getLogger('ocrd_network.processor_server') log_file = get_processor_server_logging_file_path(processor_name=processor_name, pid=getpid()) - file_handler = logging.FileHandler(filename=log_file, mode='a') - file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + file_handler = FileHandler(filename=log_file, mode='a') + file_handler.setFormatter(Formatter(LOG_FORMAT)) self.log.addHandler(file_handler) self.db_url = mongodb_addr diff --git a/ocrd_network/ocrd_network/server_cache.py b/ocrd_network/ocrd_network/server_cache.py index b2158f77f6..591ee1c8bb 100644 --- a/ocrd_network/ocrd_network/server_cache.py +++ b/ocrd_network/ocrd_network/server_cache.py @@ -1,8 +1,8 @@ from __future__ import annotations from typing import Dict, List -from logging import DEBUG, getLogger, FileHandler -from os import makedirs +from logging import FileHandler, Formatter +from ocrd_utils import getLogger, LOG_FORMAT from .database import db_get_processing_job, db_update_processing_job from .logging import ( get_cache_locked_pages_logging_file_path, @@ -19,11 +19,9 @@ class CacheLockedPages: def __init__(self) -> None: self.log = getLogger("ocrd_network.server_cache.locked_pages") - # TODO: remove this when refactoring the logging - self.log.setLevel(DEBUG) log_file = get_cache_locked_pages_logging_file_path() log_fh = FileHandler(filename=log_file, mode='a') - log_fh.setLevel(DEBUG) + log_fh.setFormatter(Formatter(LOG_FORMAT)) self.log.addHandler(log_fh) # Used for keeping track of locked pages for a workspace @@ -115,11 +113,9 @@ def unlock_pages( class CacheProcessingRequests: def __init__(self) -> None: self.log = getLogger("ocrd_network.server_cache.processing_requests") - # TODO: remove this when refactoring the logging - self.log.setLevel(DEBUG) log_file = get_cache_processing_requests_logging_file_path() log_fh = FileHandler(filename=log_file, mode='a') - log_fh.setLevel(DEBUG) + log_fh.setFormatter(Formatter(LOG_FORMAT)) self.log.addHandler(log_fh) # Used for buffering/caching processing requests in the Processing Server From 464f8eaf8c2c696668699a880f885dea14847f1b Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Thu, 12 Oct 2023 13:24:59 +0200 Subject: [PATCH 07/11] add logs and sockets env --- ocrd_network/ocrd_network/deployer.py | 9 ++++----- ocrd_network/ocrd_network/logging.py | 5 ++--- ocrd_utils/ocrd_utils/config.py | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/ocrd_network/ocrd_network/deployer.py b/ocrd_network/ocrd_network/deployer.py index a5ca5d075b..f8543cf7cc 100644 --- a/ocrd_network/ocrd_network/deployer.py +++ b/ocrd_network/ocrd_network/deployer.py @@ -12,9 +12,8 @@ from pathlib import Path import subprocess from time import sleep -from tempfile import gettempdir -from ocrd_utils import getLogger, safe_filename +from ocrd_utils import config, getLogger, safe_filename from .deployment_utils import ( create_docker_client, @@ -521,11 +520,11 @@ def start_native_processor_server( return re_search(r'xyz([0-9]+)xyz', output).group(1) # type: ignore # TODO: No support for TCP version yet - def start_unix_mets_server(self, mets_path: str) -> str: + def start_unix_mets_server(self, mets_path: str) -> Path: log_file = get_mets_server_logging_file_path(mets_path=mets_path) - mets_server_url = f'{gettempdir()}/{safe_filename(mets_path)}.sock' + mets_server_url = Path(config.OCRD_NETWORK_SOCKETS_ROOT_DIR, f"{safe_filename(mets_path)}.sock") - if is_mets_server_running(mets_server_url=mets_server_url): + if is_mets_server_running(mets_server_url=str(mets_server_url)): self.log.info(f"The mets server is already started: {mets_server_url}") return mets_server_url diff --git a/ocrd_network/ocrd_network/logging.py b/ocrd_network/ocrd_network/logging.py index 503c4d7efb..1d870e21ac 100644 --- a/ocrd_network/ocrd_network/logging.py +++ b/ocrd_network/ocrd_network/logging.py @@ -1,5 +1,5 @@ from pathlib import Path -from ocrd_utils import safe_filename +from ocrd_utils import safe_filename, config OCRD_NETWORK_MODULES = [ @@ -14,8 +14,7 @@ def get_root_logging_dir(module_name: str) -> Path: if module_name not in OCRD_NETWORK_MODULES: raise ValueError(f"Invalid module name: {module_name}, should be one of: {OCRD_NETWORK_MODULES}") - # TODO: Utilize env variable to set the root - module_log_dir = Path("/tmp", "ocrd_network_logs", module_name) + module_log_dir = Path(config.OCRD_NETWORK_LOGS_ROOT_DIR, module_name) module_log_dir.mkdir(parents=True, exist_ok=True) return module_log_dir diff --git a/ocrd_utils/ocrd_utils/config.py b/ocrd_utils/ocrd_utils/config.py index 5631758b17..b6f8b8daa8 100644 --- a/ocrd_utils/ocrd_utils/config.py +++ b/ocrd_utils/ocrd_utils/config.py @@ -9,8 +9,10 @@ from os import environ from pathlib import Path +from tempfile import gettempdir from textwrap import fill, indent + class OcrdEnvVariable(): def __init__(self, name, description, parser=str, validator=lambda val: True, default=[False, None]): @@ -156,6 +158,18 @@ def _ocrd_download_timeout_parser(val): parser=int, default=(True, 3)) +config.add(name="OCRD_NETWORK_SOCKETS_ROOT_DIR", + description="The root directory where all mets server related socket files are created", + parser=lambda val: Path(val), + default=(True, Path(gettempdir(), "ocrd_network_sockets"))) +config.OCRD_NETWORK_SOCKETS_ROOT_DIR.mkdir(parents=True, exist_ok=True) + +config.add(name="OCRD_NETWORK_LOGS_ROOT_DIR", + description="The root directory where all ocrd_network related file logs are stored", + parser=lambda val: Path(val), + default=(True, Path(gettempdir(), "ocrd_network_logs"))) +config.OCRD_NETWORK_LOGS_ROOT_DIR.mkdir(parents=True, exist_ok=True) + config.add("HOME", description="Directory to look for `ocrd_logging.conf`, fallback for unset XDG variables.", # description="HOME directory, cf. https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html", From c81ffcca9c1bf937784c102dadc1f97d6fd05150 Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Thu, 12 Oct 2023 14:17:23 +0200 Subject: [PATCH 08/11] add get log endpoint --- ocrd_network/ocrd_network/processing_server.py | 15 ++++++++++++++- ocrd_network/ocrd_network/processor_server.py | 18 ++++++++++++++++-- ocrd_network/ocrd_network/server_utils.py | 8 ++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/ocrd_network/ocrd_network/processing_server.py b/ocrd_network/ocrd_network/processing_server.py index a5cfe5e9c5..56eb123d10 100644 --- a/ocrd_network/ocrd_network/processing_server.py +++ b/ocrd_network/ocrd_network/processing_server.py @@ -14,7 +14,7 @@ UploadFile ) from fastapi.exceptions import RequestValidationError -from fastapi.responses import JSONResponse +from fastapi.responses import FileResponse, JSONResponse from pika.exceptions import ChannelClosedByBroker from ocrd.task_sequence import ProcessorTask @@ -52,6 +52,7 @@ ) from .server_utils import ( _get_processor_job, + _get_processor_job_log, expand_page_ids, validate_and_return_mets_path, validate_job_input @@ -157,6 +158,15 @@ def __init__(self, config_path: str, host: str, port: int) -> None: response_model_exclude_none=True ) + self.router.add_api_route( + path='/processor/{processor_name}/{job_id}/log', + endpoint=self.get_processor_job_log, + methods=['GET'], + tags=['processing'], + status_code=status.HTTP_200_OK, + summary='Get the log file of a job id' + ) + self.router.add_api_route( path='/result_callback', endpoint=self.remove_from_request_cache, @@ -524,6 +534,9 @@ async def push_to_processor_server( async def get_processor_job(self, processor_name: str, job_id: str) -> PYJobOutput: return await _get_processor_job(self.log, processor_name, job_id) + async def get_processor_job_log(self, processor_name: str, job_id: str) -> FileResponse: + return await _get_processor_job_log(self.log, processor_name, job_id) + async def remove_from_request_cache(self, result_message: PYResultMessage): result_job_id = result_message.job_id result_job_state = result_message.state diff --git a/ocrd_network/ocrd_network/processor_server.py b/ocrd_network/ocrd_network/processor_server.py index 24c8395fc9..670f90ed48 100644 --- a/ocrd_network/ocrd_network/processor_server.py +++ b/ocrd_network/ocrd_network/processor_server.py @@ -5,6 +5,7 @@ import uvicorn from fastapi import FastAPI, HTTPException, status +from fastapi.responses import FileResponse from ocrd_utils import ( initLogging, @@ -30,6 +31,7 @@ from .rabbitmq_utils import OcrdResultMessage from .server_utils import ( _get_processor_job, + _get_processor_job_log, validate_and_return_mets_path, validate_job_input ) @@ -109,6 +111,15 @@ def __init__(self, mongodb_addr: str, processor_name: str = "", processor_class= response_model_exclude_none=True ) + self.router.add_api_route( + path='/{job_id}/log', + endpoint=self.get_processor_job_log, + methods=['GET'], + tags=['processing'], + status_code=status.HTTP_200_OK, + summary='Get the log file of a job id' + ) + async def on_startup(self): await initiate_database(db_url=self.db_url) @@ -239,5 +250,8 @@ def get_version(self) -> str: def run_server(self, host, port): uvicorn.run(self, host=host, port=port) - async def get_processor_job(self, processor_name: str, job_id: str) -> PYJobOutput: - return await _get_processor_job(self.log, processor_name, job_id) + async def get_processor_job(self, job_id: str) -> PYJobOutput: + return await _get_processor_job(self.log, self.processor_name, job_id) + + async def get_processor_job_log(self, job_id: str) -> FileResponse: + return await _get_processor_job_log(self.log, self.processor_name, job_id) diff --git a/ocrd_network/ocrd_network/server_utils.py b/ocrd_network/ocrd_network/server_utils.py index d38933b159..aee646dddc 100644 --- a/ocrd_network/ocrd_network/server_utils.py +++ b/ocrd_network/ocrd_network/server_utils.py @@ -1,5 +1,7 @@ import re from fastapi import HTTPException, status +from fastapi.responses import FileResponse +from pathlib import Path from typing import List from ocrd_validators import ParameterValidator from ocrd_utils import ( @@ -28,6 +30,12 @@ async def _get_processor_job(logger, processor_name: str, job_id: str) -> PYJobO ) +async def _get_processor_job_log(logger, processor_name: str, job_id: str) -> FileResponse: + db_job = await _get_processor_job(logger, processor_name, job_id) + log_file_path = Path(db_job.log_file_path) + return FileResponse(path=log_file_path, filename=log_file_path.name) + + async def validate_and_return_mets_path(logger, job_input: PYJobInput) -> str: # This check is done to return early in case the workspace_id is provided # but the abs mets path cannot be queried from the DB From 76fe9acf88bb76a6fe333c23b5204f00edc76dbf Mon Sep 17 00:00:00 2001 From: mehmedGIT Date: Thu, 12 Oct 2023 14:30:49 +0200 Subject: [PATCH 09/11] fix: use Path not str --- ocrd_network/ocrd_network/deployer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_network/ocrd_network/deployer.py b/ocrd_network/ocrd_network/deployer.py index f8543cf7cc..80b23eb067 100644 --- a/ocrd_network/ocrd_network/deployer.py +++ b/ocrd_network/ocrd_network/deployer.py @@ -546,6 +546,7 @@ def start_unix_mets_server(self, mets_path: str) -> Path: def stop_unix_mets_server(self, mets_server_url: str) -> None: self.log.info(f'Stopping UDS mets server: {mets_server_url}') + mets_server_url = Path(mets_server_url) if mets_server_url in self.mets_servers: mets_server_pid = self.mets_servers[mets_server_url] else: From 32cddbeab354b4a7817cebe0093ee1476f3449ad Mon Sep 17 00:00:00 2001 From: joschrew Date: Thu, 12 Oct 2023 15:02:38 +0200 Subject: [PATCH 10/11] Fix: use Path not str 2 --- ocrd_network/ocrd_network/deployer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ocrd_network/ocrd_network/deployer.py b/ocrd_network/ocrd_network/deployer.py index 80b23eb067..5f25a58d17 100644 --- a/ocrd_network/ocrd_network/deployer.py +++ b/ocrd_network/ocrd_network/deployer.py @@ -546,9 +546,8 @@ def start_unix_mets_server(self, mets_path: str) -> Path: def stop_unix_mets_server(self, mets_server_url: str) -> None: self.log.info(f'Stopping UDS mets server: {mets_server_url}') - mets_server_url = Path(mets_server_url) - if mets_server_url in self.mets_servers: - mets_server_pid = self.mets_servers[mets_server_url] + if Path(mets_server_url) in self.mets_servers: + mets_server_pid = self.mets_servers[Path(mets_server_url)] else: raise Exception(f"Mets server not found: {mets_server_url}") From 4a57777d7500e4437b6c039a93b765b2da46f096 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 12 Oct 2023 17:24:13 +0200 Subject: [PATCH 11/11] ocrd --help: document new config options for ocrd_network --- ocrd/ocrd/cli/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ocrd/ocrd/cli/__init__.py b/ocrd/ocrd/cli/__init__.py index 3a164f826f..52e6397b59 100644 --- a/ocrd/ocrd/cli/__init__.py +++ b/ocrd/ocrd/cli/__init__.py @@ -48,6 +48,10 @@ {config.describe('OCRD_PROFILE_FILE')} \b {config.describe('OCRD_PROFILE', wrap_text=False)} +\b +{config.describe('OCRD_NETWORK_SOCKETS_ROOT_DIR')} +\b +{config.describe('OCRD_NETWORK_LOGS_ROOT_DIR')} """ def command_with_replaced_help(*replacements):