Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
RegionRefIndexedType,
OrderedGroupType,
OrderedGroupIndexedType,
UnorderedGroupType,
UnorderedGroupIndexedType,
ReadingOrderType
)
from ocrd import Processor
from .config import TESSDATA_PREFIX, OCRD_TOOL
from .recognize import page_get_reading_order
# Key of this processor's entry under OCRD_TOOL['tools'] (looked up in __init__).
TOOL = 'ocrd-tesserocr-segment-table'
# Logger for this processor, obtained under a processor-specific name.
LOG = getLogger('processor.TesserocrSegmentTable')
class TesserocrSegmentTable(Processor):
def __init__(self, *args, **kwargs):
    """Set up the processor with the tool metadata from ``OCRD_TOOL``.

    The ``ocrd_tool`` and ``version`` keyword arguments are always set
    here, replacing any values supplied by the caller, and then all
    arguments are forwarded to the parent initializer.
    """
    kwargs.update({
        'ocrd_tool': OCRD_TOOL['tools'][TOOL],
        'version': OCRD_TOOL['version'],
    })
    super(TesserocrSegmentTable, self).__init__(*args, **kwargs)
def process(self):
"""Performs table cell segmentation with Tesseract on the workspace.
Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the block level
for table regions. If ``overwrite_regions`` is enabled and any
layout annotation already exists inside, then remove it.
Set up Tesseract to detect text blocks (as table cells).
(This is not Tesseract's internal table structure recognition,
MIME_TO_EXT
)
from ocrd_models.ocrd_page import (
LabelsType, LabelType,
MetadataItemType
)
from ocrd_modelfactory import page_from_file
from ocrd import Processor
from .config import OCRD_TOOL
from .extract_pages import CLASSES
# Key of this processor's entry under OCRD_TOOL['tools'] (looked up in __init__).
TOOL = 'ocrd-segment-extract-regions'
# Logger for this processor, obtained under a processor-specific name.
LOG = getLogger('processor.ExtractRegions')
class ExtractRegions(Processor):
def __init__(self, *args, **kwargs):
    """Set up the processor with the tool metadata from ``OCRD_TOOL``.

    The ``ocrd_tool`` and ``version`` keyword arguments are always set
    here, replacing any values supplied by the caller, and then all
    arguments are forwarded to the parent initializer.
    """
    kwargs.update({
        'ocrd_tool': OCRD_TOOL['tools'][TOOL],
        'version': OCRD_TOOL['version'],
    })
    super(ExtractRegions, self).__init__(*args, **kwargs)
def process(self):
"""Extract region images from the workspace.
Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the region level.
Extract an image for each region (which depending on the workflow
can already be deskewed, dewarped, binarized etc.), cropped to its
minimal bounding box, and masked by the coordinate polygon outline.
If ``transparency`` is true, then also add an alpha channel which is
SeparatorRegionType,
NoiseRegionType,
to_xml)
from ocrd_models.ocrd_page_generateds import (
TableRegionType,
TextLineType,
TextTypeSimpleType
)
from ocrd import Processor
from .config import TESSDATA_PREFIX, OCRD_TOOL
# Key of this processor's entry under OCRD_TOOL['tools'] (looked up in __init__).
TOOL = 'ocrd-tesserocr-segment-region'
# Logger for this processor, obtained under a processor-specific name.
LOG = getLogger('processor.TesserocrSegmentRegion')
class TesserocrSegmentRegion(Processor):
def __init__(self, *args, **kwargs):
    """Set up the processor with the tool metadata from ``OCRD_TOOL``.

    The ``ocrd_tool`` and ``version`` keyword arguments are always set
    here, replacing any values supplied by the caller, and then all
    arguments are forwarded to the parent initializer.
    """
    kwargs.update({
        'ocrd_tool': OCRD_TOOL['tools'][TOOL],
        'version': OCRD_TOOL['version'],
    })
    super(TesserocrSegmentRegion, self).__init__(*args, **kwargs)
def process(self):
"""Performs region segmentation with Tesseract on the workspace.
Open and deserialize PAGE input files and their respective images,
and remove any existing Region and ReadingOrder elements
(unless ``overwrite_regions`` is False).
Set up Tesseract to detect blocks, and add each one to the page
as a region according to BlockType at the detected coordinates.
If ``find_tables`` is True, try to detect table blocks and add them
from ocrd_models.ocrd_page import (
MetadataItemType,
LabelsType, LabelType,
AlternativeImageType,
TextRegionType,
to_xml
)
from ocrd import Processor
from .config import TESSDATA_PREFIX, OCRD_TOOL
# Key of this processor's entry under OCRD_TOOL['tools'] (looked up in __init__).
TOOL = 'ocrd-tesserocr-binarize'
# Logger for this processor, obtained under a processor-specific name.
LOG = getLogger('processor.TesserocrBinarize')
# NOTE(review): presumably the fileGrp USE under which binarized images are
# stored when no other group is configured -- its use is not visible here; confirm.
FALLBACK_IMAGE_GRP = 'OCR-D-IMG-BIN'
class TesserocrBinarize(Processor):
def __init__(self, *args, **kwargs):
    """Set up the processor with the tool metadata from ``OCRD_TOOL``.

    The ``ocrd_tool`` and ``version`` keyword arguments are always set
    here, replacing any values supplied by the caller, and then all
    arguments are forwarded to the parent initializer.
    """
    kwargs.update({
        'ocrd_tool': OCRD_TOOL['tools'][TOOL],
        'version': OCRD_TOOL['version'],
    })
    super(TesserocrBinarize, self).__init__(*args, **kwargs)
def process(self):
"""Performs binarization of the region / line with Tesseract on the workspace.
Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the requested level.
Set up Tesseract to recognize the segment image's layout, and get
the binarized image. Create an image file, and reference it as
AlternativeImage in the element and as file with a fileGrp USE
equal `OCR-D-IMG-BIN` in the workspace.
'GraphicRegion:stamp': '008000DC',
'GraphicRegion:signature': '008000D7',
'GraphicRegion:barcode': '008000D2',
'GraphicRegion:paper-grow': '008000CD',
'GraphicRegion:punch-hole': '008000C8',
'GraphicRegion:other': '008000C3',
'ImageRegion': '00CED1FF',
'LineDrawingRegion': 'B8860BFF',
'MathsRegion': '00BFFFFF',
'NoiseRegion': 'FF0000FF',
'SeparatorRegion': 'FF00FFFF',
'UnknownRegion': '646464FF',
'CustomRegion': '637C81FF'}
# pragma pylint: enable=bad-whitespace
class ExtractPages(Processor):
def __init__(self, *args, **kwargs):
    """Set up the processor with the tool metadata from ``OCRD_TOOL``.

    The ``ocrd_tool`` and ``version`` keyword arguments are always set
    here, replacing any values supplied by the caller, and then all
    arguments are forwarded to the parent initializer.
    """
    kwargs.update({
        'ocrd_tool': OCRD_TOOL['tools'][TOOL],
        'version': OCRD_TOOL['version'],
    })
    super(ExtractPages, self).__init__(*args, **kwargs)
def process(self):
"""Extract page images and region descriptions (type and coordinates) from the workspace.
Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the region level.
Get all regions with their types (region element class), sub-types (@type)
and coordinates relative to the page (which depending on the workflow could
already be cropped, deskewed, dewarped, binarized etc). Extract the image of
the (cropped, deskewed, dewarped) page, both in binarized form (if available)
from ocrd_models.ocrd_page import (
LabelsType, LabelType,
MetadataItemType,
TextRegionType,
to_xml
)
from ocrd_modelfactory import page_from_file
from ocrd import Processor
from .config import OCRD_TOOL
# Key of this processor's entry under OCRD_TOOL['tools'] (looked up in __init__).
TOOL = 'ocrd-segment-replace-original'
# Logger for this processor, obtained under a processor-specific name.
LOG = getLogger('processor.ReplaceOriginal')
# NOTE(review): presumably the image fileGrp USE chosen when the output fileGrp
# has no second entry -- its use is not visible here; confirm.
FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-SUBST'
class ReplaceOriginal(Processor):
def __init__(self, *args, **kwargs):
    """Set up the processor with the tool metadata from ``OCRD_TOOL``.

    The ``ocrd_tool`` and ``version`` keyword arguments are always set
    here, replacing any values supplied by the caller, and then all
    arguments are forwarded to the parent initializer.
    """
    kwargs.update({
        'ocrd_tool': OCRD_TOOL['tools'][TOOL],
        'version': OCRD_TOOL['version'],
    })
    super(ReplaceOriginal, self).__init__(*args, **kwargs)
def process(self):
"""Extract page image and replace original with it.
Open and deserialize PAGE input files and their respective images,
then go to the page hierarchy level.
Retrieve the image of the (cropped, deskewed, dewarped) page, preferring
the last annotated form (which, depending on the workflow, could be
binarized or raw). Add that image file to the workspace with the fileGrp
USE given in the second position of the output fileGrp, or ``OCR-D-IMG-SUBST``.
MusicRegionType,
UnknownRegionType,
TextTypeSimpleType,
GraphicsTypeSimpleType,
ChartTypeSimpleType
)
# pragma pylint: enable=unused-import
from ocrd import Processor
from .config import OCRD_TOOL
from .extract_pages import CLASSES
# Key of this processor's entry under OCRD_TOOL['tools'] (looked up in __init__).
TOOL = 'ocrd-segment-from-masks'
# Logger for this processor, obtained under a processor-specific name.
LOG = getLogger('processor.ImportImageSegmentation')
class ImportImageSegmentation(Processor):
def __init__(self, *args, **kwargs):
    """Set up the processor with the tool metadata from ``OCRD_TOOL``.

    The ``ocrd_tool`` and ``version`` keyword arguments are always set
    here, replacing any values supplied by the caller, and then all
    arguments are forwarded to the parent initializer.
    """
    kwargs.update({
        'ocrd_tool': OCRD_TOOL['tools'][TOOL],
        'version': OCRD_TOOL['version'],
    })
    super(ImportImageSegmentation, self).__init__(*args, **kwargs)
def process(self):
"""Performs region segmentation by reading mask images in pseudo-colour.
Open and deserialize each PAGE input file (or generate from image input file)
from the first input file group, as well as mask image file from the second.
Then iterate over all connected (equally colored) mask segments and compute
convex hull contours for them. Convert them to polygons, and look up their
color value in ``colordict`` to instantiate the appropriate region types
(optionally with subtype). Instantiate and annotate regions accordingly.
LOG.debug("text classification result for '%s' is: %s", text, result.text)
result = json.loads(result.text)
# TODO: train visual models for soft input and use result['confidence']
result = result['resultClass']
if result != 'ADDRESS_NONE':
return result
# try a few other fallbacks
if '·' in text:
return classify_address(text.replace('·', ','))
if ' - ' in text:
return classify_address(text.replace(' - ', ', '))
if ' | ' in text:
return classify_address(text.replace(' | ', ', '))
return result
class ClassifyAddressText(Processor):
def __init__(self, *args, **kwargs):
    """Set up the processor with the tool metadata from ``OCRD_TOOL``.

    The ``ocrd_tool`` and ``version`` keyword arguments are always set
    here, replacing any values supplied by the caller, and then all
    arguments are forwarded to the parent initializer.
    """
    kwargs.update({
        'ocrd_tool': OCRD_TOOL['tools'][TOOL],
        'version': OCRD_TOOL['version'],
    })
    super(ClassifyAddressText, self).__init__(*args, **kwargs)
def process(self):
"""Classify text lines belonging to addresses from text recognition results.
Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the text line level.
Then, get the text results of each line and classify them into
text belonging to address descriptions and other.
Annotate the class results (name, street, zip, none) via `@custom` descriptor.
)
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
CoordsType,
LabelType, LabelsType,
MetadataItemType,
WordType,
to_xml,
)
from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL
# Key of this processor's entry under OCRD_TOOL['tools'] (looked up in __init__).
TOOL = 'ocrd-tesserocr-segment-word'
# Logger for this processor, obtained under a processor-specific name.
LOG = getLogger('processor.TesserocrSegmentWord')
class TesserocrSegmentWord(Processor):
def __init__(self, *args, **kwargs):
    """Set up the processor with the tool metadata from ``OCRD_TOOL``.

    The ``ocrd_tool`` and ``version`` keyword arguments are always set
    here, replacing any values supplied by the caller, and then all
    arguments are forwarded to the parent initializer.
    """
    kwargs.update({
        'ocrd_tool': OCRD_TOOL['tools'][TOOL],
        'version': OCRD_TOOL['version'],
    })
    super(TesserocrSegmentWord, self).__init__(*args, **kwargs)
def process(self):
"""Performs word segmentation with Tesseract on the workspace.
Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the textline level,
and remove any existing Word elements (unless ``overwrite_words``
is False).
Set up Tesseract to detect words, and add each one to the line
at the detected coordinates.
)
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
CoordsType,
LabelType, LabelsType,
MetadataItemType,
TextLineType,
to_xml
)
from .config import TESSDATA_PREFIX, OCRD_TOOL
# Key of this processor's entry under OCRD_TOOL['tools'] (looked up in __init__).
TOOL = 'ocrd-tesserocr-segment-line'
# Logger for this processor, obtained under a processor-specific name.
LOG = getLogger('processor.TesserocrSegmentLine')
class TesserocrSegmentLine(Processor):
def __init__(self, *args, **kwargs):
    """Set up the processor with the tool metadata from ``OCRD_TOOL``.

    The ``ocrd_tool`` and ``version`` keyword arguments are always set
    here, replacing any values supplied by the caller, and then all
    arguments are forwarded to the parent initializer.
    """
    kwargs.update({
        'ocrd_tool': OCRD_TOOL['tools'][TOOL],
        'version': OCRD_TOOL['version'],
    })
    super(TesserocrSegmentLine, self).__init__(*args, **kwargs)
def process(self):
"""Performs (text) line segmentation with Tesseract on the workspace.
Open and deserialize PAGE input files and their respective images,
then iterate over the element hierarchy down to the (text) region level,
and remove any existing TextLine elements (unless ``overwrite_lines``
is False).
Set up Tesseract to detect lines, and add each one to the region