Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 20 additions & 16 deletions ocrd/ocrd/processor/builtin/dummy_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,14 @@

class DummyProcessor(Processor):
"""
Bare-bones processor that only copies mets:file from input group to output group.
Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group
"""

def process(self):
LOG = getLogger('ocrd.dummy')
assert_file_grp_cardinality(self.input_file_grp, 1)
assert_file_grp_cardinality(self.output_file_grp, 1)
copy_files = self.parameter['copy_files']
for input_file in self.input_files:
input_file = self.workspace.download_file(input_file)
file_id = make_file_id(input_file, self.output_file_grp)
Expand All @@ -36,8 +37,8 @@ def process(self):
pcgts = page_from_file(self.workspace.download_file(input_file))
pcgts.set_pcGtsId(file_id)
self.add_metadata(pcgts)
LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
if input_file.mimetype == MIMETYPE_PAGE:
LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
# Source file is PAGE-XML: Write out in-memory PcGtsType
self.workspace.add_file(
file_id=file_id,
Expand All @@ -47,24 +48,27 @@ def process(self):
local_filename=local_filename,
content=to_xml(pcgts).encode('utf-8'))
else:
# Source file is not PAGE-XML: Copy byte-by-byte
with open(input_file.local_filename, 'rb') as f:
content = f.read()
self.workspace.add_file(
file_id=file_id,
file_grp=self.output_file_grp,
page_id=input_file.pageId,
mimetype=input_file.mimetype,
local_filename=local_filename,
content=content)
# Source file is not PAGE-XML: Copy byte-by-byte unless copy_files is False
if not copy_files:
LOG.info("Not copying %s because it is not a PAGE-XML file and copy_files was false" % input_file.local_filename)
else:
LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
with open(input_file.local_filename, 'rb') as f:
content = f.read()
self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
mimetype=input_file.mimetype,
local_filename=local_filename,
content=content)
if input_file.mimetype.startswith('image/'):
# write out the PAGE-XML representation for this image
page_file_id = file_id + '_PAGE'
pcgts.set_pcGtsId(page_file_id)
pcgts.get_Page().set_imageFilename(local_filename)
pcgts.get_Page().set_imageFilename(local_filename if copy_files else input_file.local_filename)
page_filename = join(self.output_file_grp, file_id + '.xml')
LOG.info("Add PAGE-XML %s generated for %s at %s",
page_file_id, file_id, page_filename)
LOG.info("Add PAGE-XML %s generated for %s at %s", page_file_id, file_id, page_filename)
self.workspace.add_file(
file_id=page_file_id,
file_grp=self.output_file_grp,
Expand All @@ -76,7 +80,7 @@ def process(self):

def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL
kwargs['version'] = '0.0.2'
kwargs['version'] = '0.0.3'
super(DummyProcessor, self).__init__(*args, **kwargs)

@click.command()
Expand Down
11 changes: 9 additions & 2 deletions ocrd/ocrd/processor/builtin/ocrd-tool.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
{
"executable": "ocrd-dummy",
"description": "Bare-bones processor that copies file from input group to output group",
"description": "Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group",
"steps": ["preprocessing/optimization"],
"categories": ["Image preprocessing"],
"input_file_grp": "DUMMY_INPUT",
"output_file_grp": "DUMMY_OUTPUT"
"output_file_grp": "DUMMY_OUTPUT",
"parameters": {
"copy_files": {
"type": "boolean",
"default": false,
"description": "Whether to actually copy files (true) or just create PAGE-XML as a side effect (false)"
}
}
}
38 changes: 37 additions & 1 deletion tests/processor/test_ocrd_dummy.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
# pylint: disable=invalid-name,line-too-long

from io import BytesIO
from pathlib import Path

from PIL import Image

from tests.base import TestCase, assets, main, copy_of_directory # pylint: disable=import-error, no-name-in-module
from ocrd import Resolver, Workspace
from ocrd_utils import MIMETYPE_PAGE
from ocrd_utils import MIMETYPE_PAGE, pushd_popd
from ocrd_modelfactory import page_from_file
from ocrd.processor.base import run_processor
from ocrd.processor.builtin.dummy_processor import DummyProcessor
Expand All @@ -21,6 +26,7 @@ def test_copies_ok(self):
DummyProcessor,
input_file_grp='OCR-D-IMG',
output_file_grp='OUTPUT',
parameter={'copy_files': True},
workspace=workspace
)
output_files = workspace.mets.find_all_files(fileGrp='OUTPUT')
Expand All @@ -38,11 +44,41 @@ def test_copies_ok(self):
DummyProcessor,
input_file_grp='OUTPUT',
output_file_grp='OUTPUT2',
parameter={'copy_files': True},
workspace=workspace
)
output2_files = workspace.mets.find_all_files(fileGrp='OUTPUT2')
output2_files.sort(key=lambda x: x.url)
self.assertEqual(len(output2_files), 3)

def test_copy_file_false(tmpdir):
    """
    Run DummyProcessor with ``copy_files`` set to False on generated images.

    Ten 100x100 RGB PNG images are added to file group ``IMG`` of a fresh
    workspace created in ``tmpdir`` (presumably the pytest fixture of that
    name). After processing into ``OUTPUT``, the METS must hold 20 files in
    total, and the PAGE-XML produced for page ``PHYS_0`` must report the
    image dimensions and reference the original image path.
    """
    image_count = 10
    ws = Resolver().workspace_from_nothing(directory=tmpdir)

    # Populate the input group with in-memory PNG images, one per page.
    for idx in range(image_count):
        png_buffer = BytesIO()
        Image.new('RGB', (100, 100)).save(png_buffer, format='PNG')
        ws.add_file(
            'IMG',
            file_id=f'IMG_{idx}',
            mimetype='image/png',
            page_id=f'PHYS_{idx}',
            local_filename=f'IMG/IMG_{idx}.png',
            content=png_buffer.getvalue(),
        )
    input_files = ws.mets.find_all_files(fileGrp='IMG')
    assert len(input_files) == 10

    run_processor(
        DummyProcessor,
        workspace=ws,
        input_file_grp='IMG',
        output_file_grp='OUTPUT',
        parameter={'copy_files': False},
    )

    # Only the PAGE-XML files should have been added next to the 10 images.
    all_files = ws.mets.find_all_files()
    assert len(all_files) == 20, 'We expect 10 PAGE files for the 10 image files'

    # Inspect the PAGE-XML generated for the first page.
    first_page_file = next(ws.mets.find_files(pageId='PHYS_0', fileGrp='OUTPUT'))
    page = page_from_file(ws.download_file(first_page_file)).get_Page()
    assert page.imageWidth == 100, 'image is 100 pix wide'
    assert page.imageHeight == 100, 'image is 100 pix long'
    assert page.imageFilename == 'IMG/IMG_0.png', 'imageFilename references the original img path'

if __name__ == "__main__":
main(__file__)
4 changes: 2 additions & 2 deletions tests/test_task_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,8 @@ def test_task_run(self):
ws.save_mets()
files_before = len(ws.mets.find_all_files())
run_tasks('mets.xml', 'DEBUG', None, [
"dummy -I OCR-D-IMG -O GRP1",
"dummy -I GRP1 -O GRP2",
"dummy -I OCR-D-IMG -O GRP1 -P copy_files true",
"dummy -I GRP1 -O GRP2 -P copy_files true",
])
ws.reload_mets()
# step 1: 2 images in OCR-D-IMG -> 2 images 2 PAGEXML in GRP1
Expand Down