Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ocrd.processor.base: add property zip_input_files #635

Merged
merged 10 commits into from
Nov 3, 2020
147 changes: 127 additions & 20 deletions ocrd/ocrd/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import os
import json
from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE
from ocrd_utils import VERSION as OCRD_VERSION, MIMETYPE_PAGE, getLogger
from ocrd_validators import ParameterValidator
from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType

Expand All @@ -15,9 +15,11 @@

class Processor():
"""
A processor runs an algorithm based on the workspace, the mets.xml in the
workspace (and the input files defined therein) as well as optional
parameter.
A processor is an OCR-D compliant command-line-interface for executing
kba marked this conversation as resolved.
Show resolved Hide resolved
a single workflow step on the workspace (represented by local METS). It
reads input files for all or requested physical pages of the input fileGrp(s),
and writes output files for them into the output fileGrp(s). It may take
a number of optional or mandatory parameters.
"""

def __init__(
Expand Down Expand Up @@ -100,26 +102,131 @@ def add_metadata(self, pcgts):
@property
def input_files(self):
"""
List the input files.
List the input files (for single input file groups).

- If there's a PAGE-XML for the page, take it (and forget about all
For each physical page:
- If there is a single PAGE-XML for the page, take it (and forget about all
other files for that page)
- Else if there's only one image, take it (and forget about all other
- Else if there is a single image file, take it (and forget about all other
files for that page)
- Otherwise raise an error (complaining that only PAGE-XML warrants

having multiple images for a single page)
(https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593)
"""
ret = self.workspace.mets.find_all_files(
fileGrp=self.input_file_grp, pageId=self.page_id, mimetype=MIMETYPE_PAGE)
if ret:
return ret
ret = self.workspace.mets.find_all_files(
fileGrp=self.input_file_grp, pageId=self.page_id, mimetype="//image/.*")
if self.page_id and len(ret) > 1:
raise ValueError("No PAGE-XML %s in fileGrp '%s' but multiple images." % (
"for page '%s'" % self.page_id if self.page_id else '',
self.input_file_grp
))
return ret
if not self.input_file_grp:
raise ValueError("Processor is missing input fileGrp")
ret = self.zip_input_files(mimetype=None, on_error='abort')
if not ret:
return []
assert len(ret[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
return [tuples[0] for tuples in ret]

def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
"""
List tuples of input files (for multiple input file groups).

Processors that expect/need multiple input file groups,
cannot use ``input_files``. They must align (zip) input files
across pages. This includes the case where not all pages
are equally present in all file groups. It also requires
making a consistent selection if there are multiple files
per page.

Following the OCR-D functional model, this function tries to
find a single PAGE file per page, or fall back to a single
image file per page. In either case, multiple matches per page
are an error (see error handling below).
This default behaviour can be changed by using a fixed MIME
type filter via ``mimetype``. But still, multiple matching
files per page are an error.

Single-page multiple-file errors are handled according to
``on_error``:
- if ``skip``, then the page for the respective fileGrp will be
silently skipped (as if there was no match at all)
- if ``first``, then the first matching file for the page will be
silently selected (as if the first was the only match)
- if ``last``, then the last matching file for the page will be
silently selected (as if the last was the only match)
- if ``abort``, then an exception will be raised.
Multiple matches for PAGE-XML will always raise an exception.

Args:
require_first (bool): If true, then skip a page entirely
whenever it is not available in the first input fileGrp.

mimetype (str): If not None, filter by the specified MIME
type (literal or regex prefixed by ``//``.
Otherwise prefer PAGE or image.
"""
if not self.input_file_grp:
raise ValueError("Processor is missing input fileGrp")

LOG = getLogger('ocrd.processor.base')
ifgs = self.input_file_grp.split(",")
# Iterating over all files repeatedly may seem inefficient at first sight,
# but the unnecessary OcrdFile instantiations for posterior fileGrp filtering
# can actually be much more costly than traversing the ltree.
# This might depend on the number of pages vs number of fileGrps.

pages = dict()
for i, ifg in enumerate(ifgs):
for file_ in sorted(self.workspace.mets.find_all_files(
pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
# sort by MIME type so PAGE comes before images
key=lambda file_: file_.mimetype):
if not file_.pageId:
continue
ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
if ift[i]:
LOG.debug("another page %s in input file group %s", file_.pageId, ifg)
# fileGrp has multiple files for this page ID
if mimetype:
# filter was active, this must not happen
if on_error == 'skip':
ift[i] = None
elif on_error == 'first':
pass # keep first match
elif on_error == 'last':
ift[i] = file_
elif on_error == 'abort':
raise ValueError(
"Multiple '%s' matches for page '%s' in fileGrp '%s'." % (
mimetype, file_.pageId, ifg))
else:
raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
elif (ift[i].mimetype == MIMETYPE_PAGE and
file_.mimetype != MIMETYPE_PAGE):
pass # keep PAGE match
elif (ift[i].mimetype == MIMETYPE_PAGE and
file_.mimetype == MIMETYPE_PAGE):
raise ValueError(
"Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % (
file_.pageId, ifg))
else:
# filter was inactive but no PAGE is in control, this must not happen
if on_error == 'skip':
ift[i] = None
elif on_error == 'first':
pass # keep first match
elif on_error == 'last':
ift[i] = file_
elif on_error == 'abort':
raise ValueError(
"No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % (
file_.pageId, ifg))
else:
raise Exception("Unknown 'on_error' strategy '%s'" % on_error)
else:
LOG.debug("adding page %s to input file group %s", file_.pageId, ifg)
ift[i] = file_
ifts = list()
for page, ifiles in pages.items():
for i, ifg in enumerate(ifgs):
if not ifiles[i]:
# other fallback options?
LOG.error('found no page %s in file group %s',
page, ifg)
if ifiles[0] or not require_first:
ifts.append(tuple(ifiles))
return ifts
6 changes: 6 additions & 0 deletions ocrd/ocrd/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,9 @@ def resolve_image_exif(self, image_url):
Return
:class:`OcrdExif`
"""
if not image_url:
# avoid "finding" just any file
raise Exception("Cannot resolve empty image path")
f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url))
image_filename = self.download_file(f).local_filename
ocrd_exif = exif_from_filename(image_filename)
Expand All @@ -286,6 +289,9 @@ def _resolve_image_as_pil(self, image_url, coords=None):
Image or region in image as PIL.Image

"""
if not image_url:
# avoid "finding" just any file
raise Exception("Cannot resolve empty image path")
log = getLogger('ocrd.workspace._resolve_image_as_pil')
f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url))
image_filename = self.download_file(f).local_filename
Expand Down
96 changes: 91 additions & 5 deletions tests/processor/test_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@

from tempfile import TemporaryDirectory
from os.path import join
from tests.base import TestCase, assets, main # pylint: disable=import-error, no-name-in-module
from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module
from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, IncompleteProcessor, DUMMY_TOOL

from ocrd_utils import MIMETYPE_PAGE
from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging
from ocrd.resolver import Resolver
from ocrd.processor.base import Processor, run_processor, run_cli

class TestProcessor(TestCase):

def setUp(self):
disableLogging()
initLogging()
self.resolver = Resolver()
self.workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))

Expand All @@ -28,9 +30,19 @@ def test_no_mets_url(self):
with self.assertRaisesRegex(Exception, 'pass mets_url to create a workspace'):
run_processor(DummyProcessor, resolver=self.resolver)

def test_no_input_file_grp(self):
processor = run_processor(DummyProcessor,
resolver=self.resolver,
mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
with self.assertRaisesRegex(Exception, 'Processor is missing input fileGrp'):
_ = processor.input_files

def test_with_mets_url_input_files(self):
processor = run_processor(DummyProcessor, resolver=self.resolver, mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
self.assertEqual(len(processor.input_files), 20)
processor = run_processor(DummyProcessor,
input_file_grp='OCR-D-SEG-PAGE',
resolver=self.resolver,
mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))
self.assertEqual(len(processor.input_files), 2)
self.assertTrue(all([f.mimetype == MIMETYPE_PAGE for f in processor.input_files]))

def test_parameter(self):
Expand All @@ -42,10 +54,11 @@ def test_parameter(self):
processor = run_processor(
DummyProcessor,
parameter=json.load(f),
input_file_grp="OCR-D-IMG",
resolver=self.resolver,
mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml')
)
self.assertEqual(len(processor.input_files), 20)
self.assertEqual(len(processor.input_files), 3)

def test_verify(self):
proc = DummyProcessor(self.workspace)
Expand Down Expand Up @@ -89,5 +102,78 @@ def test_run_cli(self):
resolver=Resolver(),
)

def test_zip_input_files(self):
class ZipTestProcessor(Processor): pass
with pushd_popd(tempdir=True) as tempdir:
ws = self.resolver.workspace_from_nothing(directory=tempdir)
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
ws.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', pageId='phys_0001')
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002')
for page_id in [None, 'phys_0001,phys_0002']:
with self.subTest(page_id=page_id):
proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
assert ('foobar1', 'foobar2') in tuples
assert ('foobar3', 'foobar4') in tuples
tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
assert ('foobar1', None) in tuples
tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')]
assert ('foobar1', 'foobar2') in tuples
assert ('foobar3', 'foobar4') in tuples

def test_zip_input_files_multi_mixed(self):
class ZipTestProcessor(Processor): pass
with pushd_popd(tempdir=True) as tempdir:
ws = self.resolver.workspace_from_nothing(directory=tempdir)
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
ws.add_file('GRP1', mimetype='image/png', ID='foobar1img1', pageId='phys_0001')
ws.add_file('GRP1', mimetype='image/png', ID='foobar1img2', pageId='phys_0001')
ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002')
ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4', pageId='phys_0002')
for page_id in [None, 'phys_0001,phys_0002']:
with self.subTest(page_id=page_id):
proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
print("unfiltered")
tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
assert ('foobar1', 'foobar2') in tuples
assert ('foobar3', 'foobar4') in tuples
print("PAGE-filtered")
tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
assert ('foobar3', None) in tuples
ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4dup', pageId='phys_0002')
for page_id in [None, 'phys_0001,phys_0002']:
with self.subTest(page_id=page_id):
proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')]
assert ('foobar1', 'foobar2') in tuples
assert ('foobar3', 'foobar4') in tuples
tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')]
assert ('foobar3', None) in tuples
with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."):
tuples = proc.zip_input_files(on_error='abort')
ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2dup', pageId='phys_0001')
for page_id in [None, 'phys_0001,phys_0002']:
with self.subTest(page_id=page_id):
proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"):
tuples = proc.zip_input_files()

def test_zip_input_files_require_first(self):
class ZipTestProcessor(Processor): pass
self.capture_out_err()
with pushd_popd(tempdir=True) as tempdir:
ws = self.resolver.workspace_from_nothing(directory=tempdir)
ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId=None)
ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001')
for page_id in [None, 'phys_0001,phys_0002']:
with self.subTest(page_id=page_id):
proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')]
r = self.capture_out_err()
assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err


if __name__ == "__main__":
main(__file__)