4 changes: 4 additions & 0 deletions ocrd/ocrd/cli/__init__.py
@@ -48,6 +48,10 @@
{config.describe('OCRD_PROFILE_FILE')}
\b
{config.describe('OCRD_PROFILE', wrap_text=False)}
\b
{config.describe('OCRD_NETWORK_SOCKETS_ROOT_DIR')}
\b
{config.describe('OCRD_NETWORK_LOGS_ROOT_DIR')}
"""

def command_with_replaced_help(*replacements):
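
The two new options are read from the process environment like the other OCRD_* variables. A minimal sketch of overriding them before any ocrd_network component is started, assuming the values are picked up from `os.environ` when `ocrd_utils.config` is accessed (the directories below are example values, not the shipped defaults):

```python
# Minimal sketch: override the new directory options via the environment.
# The paths are illustrative values, not the documented defaults.
import os

os.environ["OCRD_NETWORK_SOCKETS_ROOT_DIR"] = "/data/ocrd/sockets"  # example value
os.environ["OCRD_NETWORK_LOGS_ROOT_DIR"] = "/data/ocrd/logs"        # example value

from ocrd_utils import config

# All ocrd_network components derive their socket and log locations from these two options.
print(config.OCRD_NETWORK_SOCKETS_ROOT_DIR)
print(config.OCRD_NETWORK_LOGS_ROOT_DIR)
```
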
2 changes: 2 additions & 0 deletions ocrd_network/ocrd_network/database.py
@@ -166,6 +166,8 @@ async def db_update_processing_job(job_id: str, **kwargs) -> DBProcessorJob:
job.path_to_mets = value
elif key == 'exec_time':
job.exec_time = value
elif key == 'log_file_path':
job.log_file_path = value
else:
raise ValueError(f'Field "{key}" is not updatable.')
await job.save()
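
With `log_file_path` added to the updatable fields, callers can persist the location of a job's log file together with the rest of the job metadata. A hedged sketch of such an update (the job id and path are placeholders):

```python
# Sketch: persist the per-job log location via the new updatable field.
# The job id and path are placeholder values.
from ocrd_network.database import db_update_processing_job
from ocrd_network.models import StateEnum

async def mark_job_running(job_id: str, log_file_path: str) -> None:
    await db_update_processing_job(
        job_id,
        state=StateEnum.running,
        log_file_path=log_file_path,
    )
```
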
37 changes: 16 additions & 21 deletions ocrd_network/ocrd_network/deployer.py
@@ -9,20 +9,19 @@
from __future__ import annotations
from typing import Dict, List, Union
from re import search as re_search
from os import getpid
from pathlib import Path
import subprocess
from time import sleep

from ocrd_utils import getLogger, safe_filename
from ocrd_utils import config, getLogger, safe_filename

from .deployment_utils import (
create_docker_client,
DeployType,
verify_mongodb_available,
verify_rabbitmq_available,
)

from .logging import get_mets_server_logging_file_path
from .runtime_data import (
DataHost,
DataMongoDB,
@@ -468,18 +467,15 @@ def start_native_processor(
self.log.info(f'Starting native processing worker: {processor_name}')
channel = ssh_client.invoke_shell()
stdin, stdout = channel.makefile('wb'), channel.makefile('rb')
cmd = f'{processor_name} worker --database {database_url} --queue {queue_url}'
cmd = f'{processor_name} worker --database {database_url} --queue {queue_url} &'
# The only way (I could find) to start a process in the background and return early
# is this construction. The PID of the last started background process is printed
# with `echo $!`, but it appears in between other output. The `xyz` markers before
# and after the PID make it easy to filter it out via regex when returning from
# the function.

# TODO: Check here again
# log_path = f'/tmp/deployed_{processor_name}.log'
# stdin.write(f"echo starting processing worker with '{cmd}' >> '{log_path}'\n")
# stdin.write(f'{cmd} >> {log_path} 2>&1 &\n')
stdin.write(f'{cmd} &\n')
self.log.debug(f'About to execute command: {cmd}')
stdin.write(f'{cmd}\n')
stdin.write('echo xyz$!xyz \n exit \n')
output = stdout.read().decode('utf-8')
stdout.close()
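
The `xyz$!xyz` marker only exists so that the PID of the backgrounded worker can be recovered from the noisy shell output. A self-contained sketch of that extraction step (the captured output is an example):

```python
# Sketch of the PID extraction used above: the worker is started in the background,
# `echo xyz$!xyz` wraps its PID in a unique marker, and the regex pulls it out of
# whatever else the interactive shell printed. The output string is an example.
from re import search as re_search

shell_output = "Last login: Mon ...\nbash-5.1$ xyz41532xyz\nexit\n"
match = re_search(r'xyz([0-9]+)xyz', shell_output)
if match is None:
    raise RuntimeError("could not determine the PID of the background worker")
worker_pid = int(match.group(1))
print(worker_pid)  # 41532
```
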
@@ -514,22 +510,21 @@ def start_native_processor_server(
self.log.info(f"Starting native processor server: {processor_name} on {agent_address}")
channel = ssh_client.invoke_shell()
stdin, stdout = channel.makefile('wb'), channel.makefile('rb')
cmd = f'{processor_name} server --address {agent_address} --database {database_url}'
stdin.write(f"echo starting processor server with '{cmd}'\n")
stdin.write(f'{cmd} &\n')
cmd = f'{processor_name} server --address {agent_address} --database {database_url} &'
self.log.debug(f'About to execute command: {cmd}')
stdin.write(f'{cmd}\n')
stdin.write('echo xyz$!xyz \n exit \n')
output = stdout.read().decode('utf-8')
stdout.close()
stdin.close()
return re_search(r'xyz([0-9]+)xyz', output).group(1) # type: ignore

# TODO: No support for TCP version yet
def start_unix_mets_server(self, mets_path: str) -> str:
socket_file = f'{safe_filename(mets_path)}.sock'
log_path = f'/tmp/{safe_filename(mets_path)}.log'
mets_server_url = f'/tmp/{socket_file}'
def start_unix_mets_server(self, mets_path: str) -> Path:
log_file = get_mets_server_logging_file_path(mets_path=mets_path)
mets_server_url = Path(config.OCRD_NETWORK_SOCKETS_ROOT_DIR, f"{safe_filename(mets_path)}.sock")

if is_mets_server_running(mets_server_url=mets_server_url):
if is_mets_server_running(mets_server_url=str(mets_server_url)):
self.log.info(f"The mets server is already started: {mets_server_url}")
return mets_server_url

@@ -539,8 +534,8 @@ def start_unix_mets_server(self, mets_path: str) -> str:
args=['nohup', 'ocrd', 'workspace', '--mets-server-url', f'{mets_server_url}',
'-d', f'{cwd}', 'server', 'start'],
shell=False,
stdout=open(log_path, 'w'),
stderr=open(log_path, 'a'),
stdout=open(file=log_file, mode='w'),
stderr=open(file=log_file, mode='a'),
cwd=cwd,
universal_newlines=True
)
@@ -551,8 +546,8 @@

def stop_unix_mets_server(self, mets_server_url: str) -> None:
self.log.info(f'Stopping UDS mets server: {mets_server_url}')
if mets_server_url in self.mets_servers:
mets_server_pid = self.mets_servers[mets_server_url]
if Path(mets_server_url) in self.mets_servers:
mets_server_pid = self.mets_servers[Path(mets_server_url)]
else:
raise Exception(f"Mets server not found: {mets_server_url}")

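
The UDS METS servers now live under `OCRD_NETWORK_SOCKETS_ROOT_DIR` and are tracked by `Path` keys, so string URLs have to be converted back before the lookup in `stop_unix_mets_server`. A sketch of how the socket and log locations are derived (the METS path is a placeholder):

```python
# Sketch: socket and log file naming for a UDS METS server, mirroring
# start_unix_mets_server. The METS path is a placeholder.
from pathlib import Path
from ocrd_utils import config, safe_filename
from ocrd_network.logging import get_mets_server_logging_file_path

mets_path = "/data/ws1/mets.xml"
socket_path = Path(config.OCRD_NETWORK_SOCKETS_ROOT_DIR, f"{safe_filename(mets_path)}.sock")
log_file = get_mets_server_logging_file_path(mets_path=mets_path)
print(socket_path, log_file)

# self.mets_servers maps these Path objects to PIDs; a caller holding the URL as a
# string therefore looks it up as Path(mets_server_url).
```
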
47 changes: 47 additions & 0 deletions ocrd_network/ocrd_network/logging.py
@@ -0,0 +1,47 @@
from pathlib import Path
from ocrd_utils import safe_filename, config


OCRD_NETWORK_MODULES = [
"mets_servers",
"processing_jobs",
"processing_servers",
"processing_workers",
"processor_servers"
]


def get_root_logging_dir(module_name: str) -> Path:
if module_name not in OCRD_NETWORK_MODULES:
raise ValueError(f"Invalid module name: {module_name}, should be one of: {OCRD_NETWORK_MODULES}")
module_log_dir = Path(config.OCRD_NETWORK_LOGS_ROOT_DIR, module_name)
module_log_dir.mkdir(parents=True, exist_ok=True)
return module_log_dir


def get_cache_locked_pages_logging_file_path() -> Path:
return get_root_logging_dir("processing_servers") / "cache_locked_pages.log"


def get_cache_processing_requests_logging_file_path() -> Path:
return get_root_logging_dir("processing_servers") / "cache_processing_requests.log"


def get_processing_job_logging_file_path(job_id: str) -> Path:
return get_root_logging_dir("processing_jobs") / f"{job_id}.log"


def get_processing_server_logging_file_path(pid: int) -> Path:
return get_root_logging_dir("processing_servers") / f"server.{pid}.log"


def get_processing_worker_logging_file_path(processor_name: str, pid: int) -> Path:
return get_root_logging_dir("processing_workers") / f"worker.{pid}.{processor_name}.log"


def get_processor_server_logging_file_path(processor_name: str, pid: int) -> Path:
return get_root_logging_dir("processor_servers") / f"server.{pid}.{processor_name}.log"


def get_mets_server_logging_file_path(mets_path: str) -> Path:
return get_root_logging_dir("mets_servers") / f"{safe_filename(mets_path)}.log"
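
All ocrd_network log files are now collected under `OCRD_NETWORK_LOGS_ROOT_DIR`, one subdirectory per module, created on first use. A short usage sketch (processor name, PID and job id are placeholders):

```python
# Usage sketch for the helpers above; the processor name, PID and job id are placeholders.
from os import getpid
from ocrd_network.logging import (
    get_processing_job_logging_file_path,
    get_processing_worker_logging_file_path,
    get_root_logging_dir,
)

print(get_root_logging_dir("processing_workers"))                       # <logs root>/processing_workers
print(get_processing_worker_logging_file_path("ocrd-dummy", getpid()))  # .../worker.<pid>.ocrd-dummy.log
print(get_processing_job_logging_file_path(job_id="abc123"))            # .../processing_jobs/abc123.log

# get_root_logging_dir("something_else") raises ValueError, since only the
# module names listed in OCRD_NETWORK_MODULES are accepted.
```
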
5 changes: 4 additions & 1 deletion ocrd_network/ocrd_network/models/job.py
@@ -66,6 +66,7 @@ class PYJobOutput(BaseModel):
input_file_grps: List[str]
output_file_grps: Optional[List[str]]
page_id: Optional[str] = None
log_file_path: Optional[str]


class DBProcessorJob(Document):
@@ -88,6 +89,7 @@ class DBProcessorJob(Document):
start_time: Optional[datetime]
end_time: Optional[datetime]
exec_time: Optional[str]
log_file_path: Optional[str]

class Settings:
use_enum_values = True
@@ -101,7 +103,8 @@ def to_job_output(self) -> PYJobOutput:
workspace_id=self.workspace_id,
input_file_grps=self.input_file_grps,
output_file_grps=self.output_file_grps,
page_id=self.page_id
page_id=self.page_id,
log_file_path=self.log_file_path
)


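
Because the new field is part of `PYJobOutput`, clients polling the job status now also see where the job's log is written on the Processing Server host. A hedged client-side sketch (host, port, processor name and job id are placeholders):

```python
# Sketch: the job-status response may now carry log_file_path.
# Host/port, processor name and job id are placeholder values.
import requests

resp = requests.get("http://localhost:8000/processor/ocrd-dummy/abc123")
resp.raise_for_status()
job = resp.json()
print(job.get("log_file_path"))  # server-side path, or None while the job is still queued
```
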
25 changes: 23 additions & 2 deletions ocrd_network/ocrd_network/processing_server.py
@@ -1,6 +1,8 @@
import json
from logging import FileHandler, Formatter
import requests
import httpx
from os import getpid
from typing import Dict, List
import uvicorn

@@ -12,11 +14,11 @@
UploadFile
)
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from fastapi.responses import FileResponse, JSONResponse

from pika.exceptions import ChannelClosedByBroker
from ocrd.task_sequence import ProcessorTask
from ocrd_utils import initLogging, getLogger
from ocrd_utils import initLogging, getLogger, LOG_FORMAT
from ocrd import Resolver, Workspace
from pathlib import Path
from .database import (
@@ -30,6 +32,7 @@
db_update_workspace
)
from .deployer import Deployer
from .logging import get_processing_server_logging_file_path
from .models import (
DBProcessorJob,
DBWorkflowJob,
@@ -49,6 +52,7 @@
)
from .server_utils import (
_get_processor_job,
_get_processor_job_log,
expand_page_ids,
validate_and_return_mets_path,
validate_job_input
@@ -82,6 +86,11 @@ def __init__(self, config_path: str, host: str, port: int) -> None:
description='OCR-D Processing Server'
)
self.log = getLogger('ocrd_network.processing_server')
log_file = get_processing_server_logging_file_path(pid=getpid())
file_handler = FileHandler(filename=log_file, mode='a')
file_handler.setFormatter(Formatter(LOG_FORMAT))
self.log.addHandler(file_handler)

self.log.info(f"Downloading ocrd all tool json")
self.ocrd_all_tool_json = download_ocrd_all_tool_json(
ocrd_all_url="https://ocr-d.de/js/ocrd-all-tool.json"
@@ -149,6 +158,15 @@ def __init__(self, config_path: str, host: str, port: int) -> None:
response_model_exclude_none=True
)

self.router.add_api_route(
path='/processor/{processor_name}/{job_id}/log',
endpoint=self.get_processor_job_log,
methods=['GET'],
tags=['processing'],
status_code=status.HTTP_200_OK,
summary='Get the log file of a job id'
)

self.router.add_api_route(
path='/result_callback',
endpoint=self.remove_from_request_cache,
@@ -516,6 +534,9 @@ async def push_to_processor_server(
async def get_processor_job(self, processor_name: str, job_id: str) -> PYJobOutput:
return await _get_processor_job(self.log, processor_name, job_id)

async def get_processor_job_log(self, processor_name: str, job_id: str) -> FileResponse:
return await _get_processor_job_log(self.log, processor_name, job_id)

async def remove_from_request_cache(self, result_message: PYResultMessage):
result_job_id = result_message.job_id
result_job_state = result_message.state
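
The new `/log` route exposes the per-job log file over HTTP via a `FileResponse`. A hedged client sketch for downloading it (host, port, processor name and job id are placeholders):

```python
# Sketch: fetch the log file of a job from the Processing Server.
# Host/port, processor name and job id are placeholder values.
import requests

PROCESSING_SERVER = "http://localhost:8000"
processor_name, job_id = "ocrd-dummy", "abc123"

resp = requests.get(f"{PROCESSING_SERVER}/processor/{processor_name}/{job_id}/log")
resp.raise_for_status()
with open(f"{job_id}.log", "wb") as f:
    f.write(resp.content)
```
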
38 changes: 17 additions & 21 deletions ocrd_network/ocrd_network/processing_worker.py
@@ -9,22 +9,23 @@
"""

from datetime import datetime
import logging
from os import getpid, makedirs

from logging import FileHandler, Formatter
from os import getpid
from time import sleep
import pika.spec
import pika.adapters.blocking_connection
from pika.exceptions import AMQPConnectionError

from ocrd_utils import getLogger

from time import sleep

from ocrd_utils import config, getLogger, LOG_FORMAT
from .database import (
sync_initiate_database,
sync_db_get_workspace,
sync_db_update_processing_job,
)
from .logging import (
get_processing_job_logging_file_path,
get_processing_worker_logging_file_path
)
from .models import StateEnum
from .process_helpers import invoke_processor
from .rabbitmq_utils import (
@@ -39,19 +40,15 @@
verify_database_uri,
verify_and_parse_mq_uri
)
from ocrd_utils import config


class ProcessingWorker:
def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None, log_filename:str=None) -> None:
def __init__(self, rabbitmq_addr, mongodb_addr, processor_name, ocrd_tool: dict, processor_class=None) -> None:
self.log = getLogger(f'ocrd_network.processing_worker')
if not log_filename:
log_filename = f'/tmp/ocrd_worker_{processor_name}.{getpid()}.log'
self.log_filename = log_filename
# TODO: Use that handler once the separate job logs is resolved
# file_handler = logging.FileHandler(log_filename, mode='a')
# file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
# self.log.addHandler(file_handler)
log_file = get_processing_worker_logging_file_path(processor_name=processor_name, pid=getpid())
file_handler = FileHandler(filename=log_file, mode='a')
file_handler.setFormatter(Formatter(LOG_FORMAT))
self.log.addHandler(file_handler)

try:
verify_database_uri(mongodb_addr)
@@ -207,24 +204,23 @@ def process_message(self, processing_message: OcrdProcessingMessage) -> None:
execution_failed = False
self.log.debug(f'Invoking processor: {self.processor_name}')
start_time = datetime.now()
job_log_file = get_processing_job_logging_file_path(job_id=job_id)
sync_db_update_processing_job(
job_id=job_id,
state=StateEnum.running,
path_to_mets=path_to_mets,
start_time=start_time
start_time=start_time,
log_file_path=job_log_file
)
try:
# TODO: Refactor the root logging dir for jobs
# makedirs(name='/tmp/ocrd_processing_jobs_logs', exist_ok=True)
# log_filename = f'/tmp/ocrd_processing_jobs_logs/{job_id}.log'
invoke_processor(
processor_class=self.processor_class,
executable=self.processor_name,
abs_path_to_mets=path_to_mets,
input_file_grps=input_file_grps,
output_file_grps=output_file_grps,
page_id=page_id,
log_filename=self.log_filename,
log_filename=job_log_file,
parameters=processing_message.parameters,
mets_server_url=mets_server_url
)
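
Each worker process now writes its own log under `processing_workers`, while every job additionally gets a dedicated file that is both passed to `invoke_processor` and stored in the job document. A sketch of that wiring (processor name and job id are placeholders):

```python
# Sketch of the two log destinations a worker uses; processor name and job id
# are placeholders, LOG_FORMAT and getLogger come from ocrd_utils.
from logging import FileHandler, Formatter
from os import getpid
from ocrd_utils import getLogger, LOG_FORMAT
from ocrd_network.logging import (
    get_processing_job_logging_file_path,
    get_processing_worker_logging_file_path,
)

# 1) one file per worker process: <logs root>/processing_workers/worker.<pid>.<processor>.log
log = getLogger("ocrd_network.processing_worker")
handler = FileHandler(filename=get_processing_worker_logging_file_path("ocrd-dummy", getpid()), mode="a")
handler.setFormatter(Formatter(LOG_FORMAT))
log.addHandler(handler)

# 2) one file per job, handed to invoke_processor as log_filename and persisted
#    in the job document as log_file_path:
job_log_file = get_processing_job_logging_file_path(job_id="abc123")
```
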