Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
import os
from ocrd.constants import DEFAULT_CACHE_FOLDER
from ocrd.utils import getLogger, safe_filename
log = getLogger('ocrd.cache')
class ResolverCache(object):
"""
Cache of downloads, based on URL.
Args:
cache_directory (string): Where to store cached files
"""
def __init__(self, cache_directory=DEFAULT_CACHE_FOLDER):
"""
Instantiate a cache
"""
self.directory = cache_directory
if not os.path.isdir(self.directory):
from ocrd.utils import getLogger
log = getLogger('ocrd.processor')
def run_processor(processor, mets_url=None, resolver=None, workspace=None):
    """
    Create a workspace for mets_url and run processor through it

    Args:
        processor: Callable invoked with the workspace; its return value must
            expose ``process()``.
        mets_url: Handed to ``resolver.create_workspace`` when no workspace
            is supplied.
        resolver: Object exposing ``create_workspace(mets_url)``; only needed
            when no workspace is supplied.
        workspace: Workspace to run on. When ``None``, one is created from
            ``resolver`` and ``mets_url``.

    Raises:
        ValueError: If ``workspace`` is ``None`` and ``resolver`` or
            ``mets_url`` is missing.
    """
    if workspace is None:
        # A workspace can only be built from a resolver plus a METS URL;
        # ValueError is still caught by callers handling ``Exception``.
        if resolver is None:
            raise ValueError("Need to pass a resolver to create a workspace")
        if mets_url is None:
            raise ValueError("Need to pass mets_url to create a workspace")
        workspace = resolver.create_workspace(mets_url)
    log.debug("Running processor %s", processor)
    processor(workspace).process()
    # Persist only after processing has finished without raising.
    workspace.persist()
class Processor(object):
import json
import re
from jsonschema import Draft4Validator, validators # pylint: disable=import-error
from ocrd.constants import FILE_GROUP_CATEGORIES, FILE_GROUP_PREFIX, OCRD_TOOL_SCHEMA
from ocrd.utils import getLogger
log = getLogger('ocrd.validator')
# http://python-jsonschema.readthedocs.io/en/latest/faq/
def extend_with_default(validator_class):
    """
    Build a validator class that also fills in schema defaults.

    The returned class is ``validator_class`` with its ``properties`` handler
    replaced: every property whose subschema carries a ``default`` and that
    is absent from the instance is inserted into the instance (in place),
    then the original ``properties`` handler runs.

    Args:
        validator_class: Validator class exposing a ``VALIDATORS`` mapping
            with a ``"properties"`` entry.

    Returns:
        The class produced by ``validators.extend``.
    """
    validate_properties = validator_class.VALIDATORS["properties"]

    def set_defaults(validator, properties, instance, schema):
        # Only inject defaults into objects; for any other instance type go
        # straight to the original handler instead of failing on a missing
        # ``setdefault`` attribute.
        if validator.is_type(instance, "object"):
            for prop, subschema in properties.items():
                if "default" in subschema:
                    # The default object itself (not a copy) is inserted.
                    instance.setdefault(prop, subschema["default"])

        # Plain loop rather than ``yield from``: the file still carries
        # ``from __future__`` imports, so stay Python 2 compatible.
        for error in validate_properties(validator, properties, instance, schema):
            yield error

    return validators.extend(validator_class, {"properties": set_defaults})
from __future__ import absolute_import
from ocrd.model import OcrdPage
from ocrd.processor.base import Processor
from ocrd.utils import getLogger, mets_file_id
from ocrd.constants import MIMETYPE_PAGE, TESSDATA_PREFIX
import tesserocr
# Logger used by the recognizer below.
log = getLogger('processor.Tesseract3Recognizer')
# Model handed as `lang` to PyTessBaseAPI in Tesseract3Recognizer.process:
# the last entry of element [1] of tesserocr.get_languages(), resolved once
# at import time.
# NOTE(review): presumably element [1] is the list of installed languages
# and [0] the tessdata path — confirm against the tesserocr documentation.
DEFAULT_MODEL = tesserocr.get_languages()[1][-1]
class Tesseract3Recognizer(Processor):
    """
    Processor that opens a tesserocr API session and walks the input files.
    """

    def process(self):
        """
        Performs the (text) recognition.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi:
            # Element [1][-1] is the model (same expression as DEFAULT_MODEL)
            # and element [0] its location, so the model has to come first to
            # match the "model %s in %s" wording.
            languages_info = tesserocr.get_languages()
            log.info("Using model %s in %s for recognition", languages_info[1][-1], languages_info[0])
            tessapi.SetPageSegMode(tesserocr.PSM.SINGLE_LINE)
            for (n, input_file) in enumerate(self.input_files):
                log.info("INPUT FILE %i / %s", n, input_file)
                self.workspace.download_file(input_file)
                page = OcrdPage.from_file(input_file)
from __future__ import absolute_import
from ocrd.model import OcrdPage
from ocrd.processor.base import Processor
from ocrd.utils import getLogger, mets_file_id
from ocrd.constants import MIMETYPE_PAGE, TESSDATA_PREFIX
import tesserocr
log = getLogger('processor.segment_line.tesserocr')
class Tesseract3LineSegmenter(Processor):
def process(self):
"""
Performs the line segmentation.
"""
with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
for (n, input_file) in enumerate(self.input_files):
page = OcrdPage.from_file(self.workspace.download_file(input_file))
image_url = page.imageFileName
for region in page.list_textregions():
log.debug("Detecting lines in %s with tesseract", region)
image = self.workspace.resolve_image_as_pil(image_url, region.coords)
tessapi.SetImage(image)
for component in tessapi.GetComponentImages(tesserocr.RIL.TEXTLINE, True):
from __future__ import absolute_import
import os
from ocrd.model import OcrdPage
from ocrd.processor.base import Processor
from ocrd.utils import getLogger, mets_file_id
from ocrd.constants import MIMETYPE_PAGE, TESSDATA_PREFIX
import tesserocr
log = getLogger('Tesseract3RegionSegmenter')
class Tesseract3RegionSegmenter(Processor):
def process(self):
"""
Performs the region segmentation.
"""
with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
for (n, input_file) in enumerate(self.input_files):
page = OcrdPage.from_file(self.workspace.download_file(input_file))
image = self.workspace.resolve_image_as_pil(page.imageFileName)
log.debug("Detecting regions with tesseract")
tessapi.SetImage(image)
for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
box, index = component[1], component[2]
# the region reference in the reading order element