Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def process(self):
    """
    Performs the (text) recognition.

    Each input file is loaded as a PAGE document. Every text line of every
    text region is resolved to an image through the workspace, recognized
    by tesseract in single-line page segmentation mode, and the result is
    stored in the line's ``textequiv``. The page is then serialized to XML
    and added as an output file whose ID is derived from ``self.outputGrp``
    and the index of the input file.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi:
        # Log the model and tessdata location that were actually handed to
        # the API, in the order the message announces them.
        log.info("Using model %s in %s for recognition", DEFAULT_MODEL, TESSDATA_PREFIX)
        # Every image passed to tesseract below is a single cropped text line.
        tessapi.SetPageSegMode(tesserocr.PSM.SINGLE_LINE)
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            # Same loading idiom as the segmentation processors: parse
            # whatever the workspace returns for the downloaded file.
            page = OcrdPage.from_file(self.workspace.download_file(input_file))
            image_url = page.imageFileName
            log.info("page %s", page)
            for region in page.list_textregions():
                textlines = region.list_textlines()
                log.info("About to recognize text in %i lines of region '%s'", len(textlines), region.ID)
                for (line_no, line) in enumerate(textlines):
                    log.debug("Recognizing text in region '%s' line '%s'", region.ID, line_no)
                    # TODO use binarized / gray
                    image = self.workspace.resolve_image_as_pil(image_url, line.coords)
                    tessapi.SetImage(image)
                    line.textequiv = tessapi.GetUTF8Text()
            # One output file per input file, written after all of its
            # regions have been processed.
            self.add_output_file(
                ID=mets_file_id(self.outputGrp, n),
                input_file=input_file,
                mimetype=MIMETYPE_PAGE,
                content=page.to_xml()
            )
def process(self):
    """
    Performs the region segmentation.

    For each input file, the page image is handed to tesseract and every
    block-level component it reports is added to the page as a text region
    together with a reading order reference. The resulting page is
    serialized to XML and added as an output file.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            local_file = self.workspace.download_file(input_file)
            page = OcrdPage.from_file(local_file)
            page_image = self.workspace.resolve_image_as_pil(page.imageFileName)

            log.debug("Detecting regions with tesseract")
            tessapi.SetImage(page_image)
            blocks = tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True)
            for block in blocks:
                # Items 1 and 2 of each component are used as the region's
                # box and its index, respectively.
                bbox = block[1]
                block_index = block[2]
                # The same identifier names the region and its entry in
                # the reading order element.
                region_id = "r%i" % block_index
                page.add_reading_order_ref(region_id, block_index)
                page.add_textregion(region_id, bbox)

            output_id = mets_file_id(self.outputGrp, n)
            self.add_output_file(
                ID=output_id,
                input_file=input_file,
                mimetype=MIMETYPE_PAGE,
                content=page.to_xml(),
            )
def process(self):
    """
    Performs the line segmentation.

    For each input file, every text region of the page is resolved to an
    image, tesseract is asked for its text-line components, and each
    component's box is added to the region as a text line. The resulting
    page is serialized to XML and added as an output file.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            local_file = self.workspace.download_file(input_file)
            page = OcrdPage.from_file(local_file)
            page_image_url = page.imageFileName

            for region in page.list_textregions():
                log.debug("Detecting lines in %s with tesseract", region)
                region_image = self.workspace.resolve_image_as_pil(page_image_url, region.coords)
                tessapi.SetImage(region_image)
                textline_components = tessapi.GetComponentImages(tesserocr.RIL.TEXTLINE, True)
                for textline in textline_components:
                    # Item 1 of each component is the detected line's box.
                    # NOTE(review): tesseract only sees the region image
                    # here, so the box is presumably relative to that image
                    # rather than to the full page - confirm what
                    # add_textline expects.
                    region.add_textline(coords=textline[1])

            output_id = mets_file_id(self.outputGrp, n)
            self.add_output_file(
                ID=output_id,
                input_file=input_file,
                mimetype=MIMETYPE_PAGE,
                content=page.to_xml(),
            )