How to use the ocrd.processor.base.Processor class in ocrd

To help you get started, we’ve selected a few ocrd examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github OCR-D / core / tests / processor / test_processor.py View on Github external
print(json.dumps(self.parameter))

class DummyProcessorWithRequiredParameters(Processor):
    """
    Test processor whose tool description declares one required parameter.
    """

    def __init__(self, *args, **kwargs):
        """
        Force a fixed ``version`` and ``ocrd_tool`` into the keyword
        arguments (overriding any caller-supplied values) and hand
        everything on to ``Processor.__init__``.
        """
        tool_description = {
            'executable': 'ocrd-test',
            'steps': ['recognition/post-correction'],
            'parameters': {
                'i-am-required': {'required': True}
            }
        }
        kwargs.update(version='0.0.1', ocrd_tool=tool_description)
        super(DummyProcessorWithRequiredParameters, self).__init__(*args, **kwargs)

    def process(self):
        """
        Intentionally a no-op.
        """
        return None

class IncompleteProcessor(Processor):
    """
    Processor subclass that overrides nothing, so every method is the one
    inherited from ``Processor``.
    """

class TestProcessor(TestCase):

    def setUp(self):
        """
        Create a fresh resolver and open the workspace for the
        ``SBB0000F29300010000`` METS asset before each test.
        """
        self.resolver = Resolver()
        mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
        self.workspace = self.resolver.workspace_from_url(mets_url)

    def test_incomplete_processor(self):
        """
        Calling ``process`` on a subclass that does not override it must
        raise an exception whose message matches 'Must be implemented'.
        """
        incomplete = IncompleteProcessor(None)
        # Callable form of assertRaisesRegex: invokes incomplete.process()
        # and checks the raised exception, same as the context-manager form.
        self.assertRaisesRegex(Exception, 'Must be implemented', incomplete.process)

    def test_no_resolver(self):
        """
        ``run_processor`` called with only a processor class must raise an
        exception whose message matches
        'pass a resolver to create a workspace'.
        """
        expected_message = 'pass a resolver to create a workspace'
        # Callable form: run_processor(DummyProcessor) is invoked by the
        # assertion helper itself.
        self.assertRaisesRegex(Exception, expected_message, run_processor, DummyProcessor)
github OCR-D / core / tests / processor / test_processor.py View on Github external
def test_params(self):
        """
        A ``Processor`` constructed with only a workspace exposes an empty
        ``parameter`` dict.
        """
        bare_processor = Processor(workspace=self.workspace)
        actual_parameters = bare_processor.parameter
        self.assertEqual(actual_parameters, {})
github OCR-D / core / tests / processor / test_processor.py View on Github external
'default': 'bla'
        }
    }
}

class DummyProcessor(Processor):
    """
    Minimal test processor configured with the ``DUMMY_TOOL`` description.
    """

    def __init__(self, *args, **kwargs):
        """
        Force ``ocrd_tool`` and ``version`` into the keyword arguments
        (overriding any caller-supplied values) and hand everything on to
        ``Processor.__init__``.
        """
        kwargs.update(ocrd_tool=DUMMY_TOOL, version='0.0.1')
        super(DummyProcessor, self).__init__(*args, **kwargs)

    def process(self):
        """
        Print this processor's ``parameter`` attribute serialized as JSON.
        """
        serialized = json.dumps(self.parameter)
        print(serialized)

class DummyProcessorWithRequiredParameters(Processor):
    """
    Test processor whose tool description declares one required parameter.
    """

    def __init__(self, *args, **kwargs):
        """
        Force a fixed ``version`` and ``ocrd_tool`` into the keyword
        arguments (overriding any caller-supplied values) and hand
        everything on to ``Processor.__init__``.
        """
        tool_description = {
            'executable': 'ocrd-test',
            'steps': ['recognition/post-correction'],
            'parameters': {
                'i-am-required': {'required': True}
            }
        }
        kwargs.update(version='0.0.1', ocrd_tool=tool_description)
        super(DummyProcessorWithRequiredParameters, self).__init__(*args, **kwargs)

    def process(self):
        """
        Intentionally a no-op.
        """
        return None

class IncompleteProcessor(Processor):
    """
    Processor subclass that overrides nothing, so every method is the one
    inherited from ``Processor``.
    """

class TestProcessor(TestCase):
github OCR-D / core / tests / processor / test_processor.py View on Github external
from ocrd.resolver import Resolver
from ocrd.processor.base import Processor, run_processor, run_cli

# Tool description passed as ``ocrd_tool`` to DummyProcessor. It declares a
# single string parameter ``baz`` whose default value is 'bla'.
DUMMY_TOOL = {
    'executable': 'ocrd-test',
    'steps': ['recognition/post-correction'],
    'parameters': {
        'baz': {
            'type': 'string',
            'default': 'bla'
        }
    }
}

class DummyProcessor(Processor):
    """
    Minimal test processor configured with the ``DUMMY_TOOL`` description.
    """

    def __init__(self, *args, **kwargs):
        """
        Force ``ocrd_tool`` and ``version`` into the keyword arguments
        (overriding any caller-supplied values) and hand everything on to
        ``Processor.__init__``.
        """
        kwargs.update(ocrd_tool=DUMMY_TOOL, version='0.0.1')
        super(DummyProcessor, self).__init__(*args, **kwargs)

    def process(self):
        """
        Print this processor's ``parameter`` attribute serialized as JSON.
        """
        serialized = json.dumps(self.parameter)
        print(serialized)

class DummyProcessorWithRequiredParameters(Processor):
    def process(self): pass
    def __init__(self, *args, **kwargs):
        kwargs['version'] = '0.0.1'
        kwargs['ocrd_tool'] = {
            'executable': 'ocrd-test',
            'steps': ['recognition/post-correction'],
github OCR-D / core / ocrd / processor / segment_region / tesserocr.py View on Github external
from __future__ import absolute_import
import os

from ocrd.model import OcrdPage
from ocrd.processor.base import Processor
from ocrd.utils import getLogger, mets_file_id
from ocrd.constants import MIMETYPE_PAGE, TESSDATA_PREFIX

import tesserocr

log = getLogger('Tesseract3RegionSegmenter')

class Tesseract3RegionSegmenter(Processor):

    def process(self):
        """
        Performs the region segmentation.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            for (n, input_file) in enumerate(self.input_files):
                page = OcrdPage.from_file(self.workspace.download_file(input_file))
                image = self.workspace.resolve_image_as_pil(page.imageFileName)
                log.debug("Detecting regions with tesseract")
                tessapi.SetImage(image)
                for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                    box, index = component[1], component[2]
                    # the region reference in the reading order element
                    ID = "r%i" % index
                    page.add_reading_order_ref(ID, index)
github OCR-D / core / ocrd / processor / characterize / exif.py View on Github external
# -*- coding: utf-8 -*-
from __future__ import absolute_import
#  import re
import exiftool

from ocrd.constants import EXIF_COMPRESSION_METHODS, EXIF_PHOTOMETRICINTERPRETATION_VALUES, EXIF_RESOLUTIONUNIT_VALUES
from ocrd.processor.base import Processor
from ocrd.model.ocrd_page import OcrdPage

class ExifProcessor(Processor):
    """
    Extracts image meta data.
    """

    def verify(self):
        """
        Ensure that the output is only pages.

        NOTE(review): no check is implemented here; the method
        unconditionally returns True.
        """
        return True

    def process(self):
        """
        Performs the image characterization.
        """
        with exiftool.ExifTool() as et:
            for input_file in self.workspace.mets.find_files(fileGrp='INPUT'):
github OCR-D / core / ocrd / processor / segment_line / tesserocr.py View on Github external
from __future__ import absolute_import

from ocrd.model import OcrdPage
from ocrd.processor.base import Processor
from ocrd.utils import getLogger, mets_file_id
from ocrd.constants import MIMETYPE_PAGE, TESSDATA_PREFIX

import tesserocr

log = getLogger('processor.segment_line.tesserocr')

class Tesseract3LineSegmenter(Processor):

    def process(self):
        """
        Performs the line segmentation.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            for (n, input_file) in enumerate(self.input_files):
                page = OcrdPage.from_file(self.workspace.download_file(input_file))
                image_url = page.imageFileName
                for region in page.list_textregions():
                    log.debug("Detecting lines in %s with tesseract", region)
                    image = self.workspace.resolve_image_as_pil(image_url, region.coords)
                    tessapi.SetImage(image)
                    for component in tessapi.GetComponentImages(tesserocr.RIL.TEXTLINE, True):
                        region.add_textline(coords=component[1])
                self.add_output_file(
github OCR-D / core / ocrd / processor / recognize / tesserocr.py View on Github external
from __future__ import absolute_import

from ocrd.model import OcrdPage
from ocrd.processor.base import Processor
from ocrd.utils import getLogger, mets_file_id
from ocrd.constants import MIMETYPE_PAGE, TESSDATA_PREFIX

import tesserocr

log = getLogger('processor.Tesseract3Recognizer')

# Model passed as ``lang`` to PyTessBaseAPI: the last entry of the second
# element returned by tesserocr.get_languages().
# NOTE(review): presumably get_languages() returns (tessdata_path,
# [language names]) -- confirm against the tesserocr documentation.
# NOTE(review): evaluated at import time; if that list is empty the [-1]
# lookup would raise IndexError -- confirm whether that can happen here.
DEFAULT_MODEL = tesserocr.get_languages()[1][-1]

class Tesseract3Recognizer(Processor):

    def process(self):
        """
        Performs the (text) recognition.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi:
            log.info("Using model %s in %s for recognition", tesserocr.get_languages()[0], tesserocr.get_languages()[1][-1])
            tessapi.SetPageSegMode(tesserocr.PSM.SINGLE_LINE)
            for (n, input_file) in enumerate(self.input_files):
                log.info("INPUT FILE %i / %s", n, input_file)
                self.workspace.download_file(input_file)
                page = OcrdPage.from_file(input_file)
                image_url = page.imageFileName
                log.info("page %s", page)
                for region in page.list_textregions():
                    textlines = region.list_textlines()