How to use the ocrd.Processor class in ocrd

To help you get started, we’ve selected a few examples of ocrd.Processor, based on popular ways this class is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github OCR-D / ocrd_tesserocr / ocrd_tesserocr / segment_table.py View on Github external
RegionRefIndexedType,
    OrderedGroupType,
    OrderedGroupIndexedType,
    UnorderedGroupType,
    UnorderedGroupIndexedType,
    ReadingOrderType
)
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
from .recognize import page_get_reading_order

TOOL = 'ocrd-tesserocr-segment-table'
LOG = getLogger('processor.TesserocrSegmentTable')

class TesserocrSegmentTable(Processor):

    def __init__(self, *args, **kwargs):
        """Set up the processor with this tool's description and version.

        The ``ocrd_tool`` and ``version`` keyword arguments are always taken
        from ``OCRD_TOOL`` (replacing any values passed by the caller); all
        other arguments are forwarded unchanged to the parent constructor.
        """
        kwargs.update(
            ocrd_tool=OCRD_TOOL['tools'][TOOL],
            version=OCRD_TOOL['version'],
        )
        super(TesserocrSegmentTable, self).__init__(*args, **kwargs)

    def process(self):
        """Performs table cell segmentation with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the block level
        for table regions. If ``overwrite_regions`` is enabled and any
        layout annotation already exists inside, then remove it.
        
        Set up Tesseract to detect text blocks (as table cells).
        (This is not Tesseract's internal table structure recognition,
github OCR-D / ocrd_segment / ocrd_segment / extract_regions.py View on Github external
MIME_TO_EXT
)
from ocrd_models.ocrd_page import (
    LabelsType, LabelType,
    MetadataItemType
)
from ocrd_modelfactory import page_from_file
from ocrd import Processor

from .config import OCRD_TOOL
from .extract_pages import CLASSES

TOOL = 'ocrd-segment-extract-regions'
LOG = getLogger('processor.ExtractRegions')

class ExtractRegions(Processor):

    def __init__(self, *args, **kwargs):
        """Set up the processor with this tool's description and version.

        The ``ocrd_tool`` and ``version`` keyword arguments are always taken
        from ``OCRD_TOOL`` (replacing any values passed by the caller); all
        other arguments are forwarded unchanged to the parent constructor.
        """
        tool_description = OCRD_TOOL['tools'][TOOL]
        kwargs['ocrd_tool'] = tool_description
        tool_version = OCRD_TOOL['version']
        kwargs['version'] = tool_version
        super(ExtractRegions, self).__init__(*args, **kwargs)

    def process(self):
        """Extract region images from the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.
        
        Extract an image for each region (which depending on the workflow
        can already be deskewed, dewarped, binarized etc.), cropped to its
        minimal bounding box, and masked by the coordinate polygon outline.
        If ``transparency`` is true, then also add an alpha channel which is
github OCR-D / ocrd_tesserocr / ocrd_tesserocr / segment_region.py View on Github external
SeparatorRegionType,
    NoiseRegionType,
    to_xml)
from ocrd_models.ocrd_page_generateds import (
    TableRegionType,
    TextLineType,
    TextTypeSimpleType
)
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL

TOOL = 'ocrd-tesserocr-segment-region'
LOG = getLogger('processor.TesserocrSegmentRegion')

class TesserocrSegmentRegion(Processor):

    def __init__(self, *args, **kwargs):
        """Set up the processor with this tool's description and version.

        The ``ocrd_tool`` and ``version`` keyword arguments are always taken
        from ``OCRD_TOOL`` (replacing any values passed by the caller); all
        other arguments are forwarded unchanged to the parent constructor.
        """
        kwargs.update(
            ocrd_tool=OCRD_TOOL['tools'][TOOL],
            version=OCRD_TOOL['version'],
        )
        super(TesserocrSegmentRegion, self).__init__(*args, **kwargs)

    def process(self):
        """Performs region segmentation with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        and remove any existing Region and ReadingOrder elements
        (unless ``overwrite_regions`` is False).
        
        Set up Tesseract to detect blocks, and add each one to the page
        as a region according to BlockType at the detected coordinates.
        If ``find_tables`` is True, try to detect table blocks and add them
github OCR-D / ocrd_tesserocr / ocrd_tesserocr / binarize.py View on Github external
from ocrd_models.ocrd_page import (
    MetadataItemType,
    LabelsType, LabelType,
    AlternativeImageType,
    TextRegionType,
    to_xml
)
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL

TOOL = 'ocrd-tesserocr-binarize'
LOG = getLogger('processor.TesserocrBinarize')
FALLBACK_IMAGE_GRP = 'OCR-D-IMG-BIN'

class TesserocrBinarize(Processor):

    def __init__(self, *args, **kwargs):
        """Set up the processor with this tool's description and version.

        The ``ocrd_tool`` and ``version`` keyword arguments are always taken
        from ``OCRD_TOOL`` (replacing any values passed by the caller); all
        other arguments are forwarded unchanged to the parent constructor.
        """
        tool_description = OCRD_TOOL['tools'][TOOL]
        kwargs['ocrd_tool'] = tool_description
        tool_version = OCRD_TOOL['version']
        kwargs['version'] = tool_version
        super(TesserocrBinarize, self).__init__(*args, **kwargs)

    def process(self):
        """Performs binarization of the region / line with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the requested level.
        
        Set up Tesseract to recognize the segment image's layout, and get
        the binarized image. Create an image file, and reference it as
        AlternativeImage in the element and as file with a fileGrp USE
        equal `OCR-D-IMG-BIN` in the workspace.
github OCR-D / ocrd_segment / ocrd_segment / extract_pages.py View on Github external
'GraphicRegion:stamp':                  '008000DC',
    'GraphicRegion:signature':              '008000D7',
    'GraphicRegion:barcode':                '008000D2',
    'GraphicRegion:paper-grow':             '008000CD',
    'GraphicRegion:punch-hole':             '008000C8',
    'GraphicRegion:other':                  '008000C3',
    'ImageRegion':                          '00CED1FF',
    'LineDrawingRegion':                    'B8860BFF',
    'MathsRegion':                          '00BFFFFF',
    'NoiseRegion':                          'FF0000FF',
    'SeparatorRegion':                      'FF00FFFF',
    'UnknownRegion':                        '646464FF',
    'CustomRegion':                         '637C81FF'}
# pragma pylint: enable=bad-whitespace

class ExtractPages(Processor):

    def __init__(self, *args, **kwargs):
        """Set up the processor with this tool's description and version.

        The ``ocrd_tool`` and ``version`` keyword arguments are always taken
        from ``OCRD_TOOL`` (replacing any values passed by the caller); all
        other arguments are forwarded unchanged to the parent constructor.
        """
        kwargs.update(
            ocrd_tool=OCRD_TOOL['tools'][TOOL],
            version=OCRD_TOOL['version'],
        )
        super(ExtractPages, self).__init__(*args, **kwargs)

    def process(self):
        """Extract page images and region descriptions (type and coordinates) from the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.
        
        Get all regions with their types (region element class), sub-types (@type)
        and coordinates relative to the page (which depending on the workflow could
        already be cropped, deskewed, dewarped, binarized etc). Extract the image of
        the (cropped, deskewed, dewarped) page, both in binarized form (if available)
github OCR-D / ocrd_segment / ocrd_segment / replace_original.py View on Github external
from ocrd_models.ocrd_page import (
    LabelsType, LabelType,
    MetadataItemType,
    TextRegionType,
    to_xml
)
from ocrd_modelfactory import page_from_file
from ocrd import Processor

from .config import OCRD_TOOL

TOOL = 'ocrd-segment-replace-original'
LOG = getLogger('processor.ReplaceOriginal')
FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-SUBST'

class ReplaceOriginal(Processor):

    def __init__(self, *args, **kwargs):
        """Set up the processor with this tool's description and version.

        The ``ocrd_tool`` and ``version`` keyword arguments are always taken
        from ``OCRD_TOOL`` (replacing any values passed by the caller); all
        other arguments are forwarded unchanged to the parent constructor.
        """
        tool_description = OCRD_TOOL['tools'][TOOL]
        kwargs['ocrd_tool'] = tool_description
        tool_version = OCRD_TOOL['version']
        kwargs['version'] = tool_version
        super(ReplaceOriginal, self).__init__(*args, **kwargs)

    def process(self):
        """Extract page image and replace original with it.
        
        Open and deserialize PAGE input files and their respective images,
        then go to the page hierarchy level.
        
        Retrieve the image of the (cropped, deskewed, dewarped) page, preferring
        the last annotated form (which, depending on the workflow, could be
        binarized or raw). Add that image file to the workspace with the fileGrp
        USE given in the second position of the output fileGrp, or ``OCR-D-IMG-SUBST``.
github OCR-D / ocrd_segment / ocrd_segment / import_image_segmentation.py View on Github external
MusicRegionType,
    UnknownRegionType,
    TextTypeSimpleType,
    GraphicsTypeSimpleType,
    ChartTypeSimpleType
)
# pragma pylint: enable=unused-import
from ocrd import Processor

from .config import OCRD_TOOL
from .extract_pages import CLASSES

TOOL = 'ocrd-segment-from-masks'
LOG = getLogger('processor.ImportImageSegmentation')

class ImportImageSegmentation(Processor):

    def __init__(self, *args, **kwargs):
        """Set up the processor with this tool's description and version.

        The ``ocrd_tool`` and ``version`` keyword arguments are always taken
        from ``OCRD_TOOL`` (replacing any values passed by the caller); all
        other arguments are forwarded unchanged to the parent constructor.
        """
        kwargs.update(
            ocrd_tool=OCRD_TOOL['tools'][TOOL],
            version=OCRD_TOOL['version'],
        )
        super(ImportImageSegmentation, self).__init__(*args, **kwargs)

    def process(self):
        """Performs region segmentation by reading mask images in pseudo-colour.
        
        Open and deserialize each PAGE input file (or generate from image input file)
        from the first input file group, as well as mask image file from the second.
        
        Then iterate over all connected (equally colored) mask segments and compute
        convex hull contours for them. Convert them to polygons, and look up their
        color value in ``colordict`` to instantiate the appropriate region types
        (optionally with subtype). Instantiate and annotate regions accordingly.
github OCR-D / ocrd_segment / ocrd_segment / classify_address_text.py View on Github external
LOG.debug("text classification result for '%s' is: %s", text, result.text)
    result = json.loads(result.text)
    # TODO: train visual models for soft input and use result['confidence']
    result = result['resultClass']
    if result != 'ADDRESS_NONE':
        return result
    # try a few other fallbacks
    if '·' in text:
        return classify_address(text.replace('·', ','))
    if ' - ' in text:
        return classify_address(text.replace(' - ', ', '))
    if ' | ' in text:
        return classify_address(text.replace(' | ', ', '))
    return result

class ClassifyAddressText(Processor):

    def __init__(self, *args, **kwargs):
        """Set up the processor with this tool's description and version.

        The ``ocrd_tool`` and ``version`` keyword arguments are always taken
        from ``OCRD_TOOL`` (replacing any values passed by the caller); all
        other arguments are forwarded unchanged to the parent constructor.
        """
        tool_description = OCRD_TOOL['tools'][TOOL]
        kwargs['ocrd_tool'] = tool_description
        tool_version = OCRD_TOOL['version']
        kwargs['version'] = tool_version
        super(ClassifyAddressText, self).__init__(*args, **kwargs)

    def process(self):
        """Classify text lines belonging to addresses from text recognition results.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the text line level.
        
        Then, get the text results of each line and classify them into
        text belonging to address descriptions and other.
        
        Annotate the class results (name, street, zip, none) via `@custom` descriptor.
github OCR-D / ocrd_tesserocr / ocrd_tesserocr / segment_word.py View on Github external
)
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
    CoordsType,
    LabelType, LabelsType,
    MetadataItemType,
    WordType,
    to_xml,
)

from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL

TOOL = 'ocrd-tesserocr-segment-word'
LOG = getLogger('processor.TesserocrSegmentWord')

class TesserocrSegmentWord(Processor):

    def __init__(self, *args, **kwargs):
        """Set up the processor with this tool's description and version.

        The ``ocrd_tool`` and ``version`` keyword arguments are always taken
        from ``OCRD_TOOL`` (replacing any values passed by the caller); all
        other arguments are forwarded unchanged to the parent constructor.
        """
        kwargs.update(
            ocrd_tool=OCRD_TOOL['tools'][TOOL],
            version=OCRD_TOOL['version'],
        )
        super(TesserocrSegmentWord, self).__init__(*args, **kwargs)

    def process(self):
        """Performs word segmentation with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the textline level,
        and remove any existing Word elements (unless ``overwrite_words``
        is False).
        
        Set up Tesseract to detect words, and add each one to the line
        at the detected coordinates.
github OCR-D / ocrd_tesserocr / ocrd_tesserocr / segment_line.py View on Github external
)
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
    CoordsType,
    LabelType, LabelsType,
    MetadataItemType,
    TextLineType,
    to_xml
)

from .config import TESSDATA_PREFIX, OCRD_TOOL

TOOL = 'ocrd-tesserocr-segment-line'
LOG = getLogger('processor.TesserocrSegmentLine')

class TesserocrSegmentLine(Processor):

    def __init__(self, *args, **kwargs):
        """Set up the processor with this tool's description and version.

        The ``ocrd_tool`` and ``version`` keyword arguments are always taken
        from ``OCRD_TOOL`` (replacing any values passed by the caller); all
        other arguments are forwarded unchanged to the parent constructor.
        """
        tool_description = OCRD_TOOL['tools'][TOOL]
        kwargs['ocrd_tool'] = tool_description
        tool_version = OCRD_TOOL['version']
        kwargs['version'] = tool_version
        super(TesserocrSegmentLine, self).__init__(*args, **kwargs)


    def process(self):
        """Performs (text) line segmentation with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the (text) region level,
        and remove any existing TextLine elements (unless ``overwrite_lines``
        is False).
        
        Set up Tesseract to detect lines, and add each one to the region