Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions ocrd_utils/ocrd_utils/str.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,24 +62,25 @@ def make_file_id(ocrd_file, output_file_grp):
Derive a new file ID for an output file from an existing input file ``ocrd_file``
and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.
If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``.
Else if ``ocrd_file``'s ID contains the input file's pageId, then merely append ``output_file_grp``.
Otherwise use ``output_file_grp`` together with the position of ``ocrd_file`` within the input fileGrp
(as a fallback counter). Increment counter until there is no more ID conflict.
(as a fallback counter), and increment counter until there is no more ID conflict.
"""
ret = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
if ret == ocrd_file.ID:
m = re.match(r'.*?(\d{3,}).*', ocrd_file.pageId or '')
if m:
n = int(m.group(1))
if ocrd_file.pageId and ocrd_file.pageId in ocrd_file.ID:
# still sufficiently unique
ret = output_file_grp + '_' + ocrd_file.ID
else:
ids = [f.ID for f in ocrd_file.mets.find_files(fileGrp=ocrd_file.fileGrp, mimetype=ocrd_file.mimetype)]
try:
n = ids.index(ocrd_file.ID) + 1
except ValueError:
n = len(ids)
ret = concat_padded(output_file_grp, n)
while next(ocrd_file.mets.find_files(ID=ret), None):
n += 1
ret = concat_padded(output_file_grp, n)
while next(ocrd_file.mets.find_files(ID=ret), None):
n += 1
ret = concat_padded(output_file_grp, n)
if not REGEX_FILE_ID.fullmatch(ret):
ret = ret.replace(':', '_')
ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret)
Expand Down
12 changes: 11 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,10 +289,20 @@ def test_make_file_id_570(self):
def test_make_file_id_605(self):
"""https://github.com/OCR-D/core/pull/605"""
mets = OcrdMets.empty_mets()
f = mets.add_file('1:!GRP', ID='FOO_0001', pageId='phys0001')
f = mets.add_file('2:!GRP', ID='FOO_0001', pageId='phys0001')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably disallow file groups whose names start with a number: the resulting file ID might then be an invalid xsd:ID, since an xsd:ID must not start with a digit.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, if #746 kicks in, such tests should all fail...

f = mets.add_file('2:!GRP', ID='FOO_0002', pageId='phys0002')
self.assertEqual(make_file_id(f, '2:!GRP'), 'id_2_GRP_0002')

def test_make_file_id_744(self):
    """
    Regression test for https://github.com/OCR-D/core/pull/744

    > Often file IDs have two numbers, one of which will clash. In that case only the numerical fallback works.
    """
    file_grp = '2:!GRP'
    mets = OcrdMets.empty_mets()
    # Register two files in the same fileGrp; only the second one is
    # handed to make_file_id below.
    mets.add_file(file_grp, ID='img1796-97_00000024_img', pageId='phys0024')
    second_file = mets.add_file(file_grp, ID='img1796-97_00000025_img', pageId='phys0025')
    derived_id = make_file_id(second_file, file_grp)
    self.assertEqual(derived_id, 'id_2_GRP_0025')

def test_generate_range(self):
assert generate_range('PHYS_0001', 'PHYS_0005') == ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005']
with self.assertRaisesRegex(ValueError, 'Unable to generate range'):
Expand Down