Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def set_builder(self):
    """Install a ``builders.DigitLineBoxBuilder`` as this object's builder."""
    digit_builder = builders.DigitLineBoxBuilder()
    self._builder = digit_builder
def __get_boxes(self):
    """
    Get all the boxes of this page from its box file.

    The file at ``self.__box_path`` is first parsed with pyocr's
    ``LineBoxBuilder``. If that yields nothing, the same file is parsed
    again with ``WordBoxBuilder`` (the old box format) and a warning is
    logged.

    Returns:
        Whatever the builder's ``read_file()`` produced, or an empty list
        if the box file could not be read (the error is logged).
    """
    boxfile = self.__box_path
    try:
        box_builder = pyocr.builders.LineBoxBuilder()
        with codecs.open(boxfile, 'r', encoding='utf-8') as file_desc:
            boxes = box_builder.read_file(file_desc)
        if boxes:
            return boxes

        # fallback: old format: word boxes
        # shouldn't be used anymore ...
        logger.warning("WARNING: Doc %s uses old box format", self.doc)
        box_builder = pyocr.builders.WordBoxBuilder()
        with codecs.open(boxfile, 'r', encoding='utf-8') as file_desc:
            boxes = box_builder.read_file(file_desc)
        return boxes
    except IOError as exc:
        # "except X as name" is valid on Python 2.6+ and Python 3, unlike
        # the comma form which only parses on Python 2.
        logger.error("Unable to get boxes for '%s': %s",
                     self.doc.docid, exc)
        return []
def __get_boxes(self):
    """
    Get all the word boxes of this page.

    A result already stored in ``self.__boxes`` is returned as-is.
    Otherwise, if the box file exists on ``self.fs``, it is parsed with
    pyocr's ``LineBoxBuilder``; the result is cached in ``self.__boxes``
    and returned. If the file is missing or reading it raises ``IOError``
    (which is logged), the method falls back on ``self.pdf_page``.

    NOTE(review): this excerpt ends right after the text layout is
    fetched; the rest of the fallback (and its return value) is not
    visible here.
    """
    # Reuse the cached boxes when they have already been computed.
    if self.__boxes is not None:
        return self.__boxes

    # Check first if there is an OCR file available
    boxfile = self.__get_box_path()
    if self.fs.exists(boxfile):
        box_builder = pyocr.builders.LineBoxBuilder()
        try:
            with self.fs.open(boxfile, 'r') as file_desc:
                self.__boxes = box_builder.read_file(file_desc)
            return self.__boxes
        except IOError as exc:
            logger.error("Unable to get boxes for '%s': %s"
                         % (self.doc.docid, exc))
            # will fall back on pdf boxes

    # fall back on what libpoppler tells us
    txt = self.pdf_page.get_text()
    # Reset the cache to an empty list before the fallback fills it in
    # (presumably in the part of the method not shown here).
    self.__boxes = []
    layout = self.pdf_page.get_text_layout()
def __get_boxes(self):
    """
    Get all the word boxes of this page.

    A result already stored in ``self.__boxes`` is returned as-is.
    Otherwise the box file is probed with ``os.stat()``; if it exists it
    is parsed with pyocr's ``LineBoxBuilder`` and the result is cached in
    ``self.__boxes`` and returned. A missing file (``OSError``) or a read
    failure (``IOError``, logged) leads to the libpoppler fallback.

    NOTE(review): this excerpt ends at the start of the fallback; the
    remainder of the method is not visible here.
    """
    if self.__boxes is not None:
        return self.__boxes

    # Check first if there is an OCR file available
    boxfile = self.__get_box_path()
    try:
        # os.stat() is only used as an existence probe; its result is
        # deliberately discarded.
        os.stat(boxfile)
        box_builder = pyocr.builders.LineBoxBuilder()
        try:
            with codecs.open(boxfile, 'r', encoding='utf-8') as file_desc:
                self.__boxes = box_builder.read_file(file_desc)
            return self.__boxes
        except IOError as exc:
            logger.error("Unable to get boxes for '%s': %s",
                         self.doc.docid, exc)
            # will fall back on pdf boxes
    except OSError:  # os.stat() failed
        # Deliberate best effort: no box file means we use the fallback.
        pass

    # fall back on what libpoppler tells us
    # TODO: Line support !
# NOTE(review): fragment of a function body; the enclosing ``def`` and the
# definitions of best_tool, py_language, py_language_code, image and
# evaluation are outside this excerpt.

# Emit the 'lang' message and stop when the OCR tool does not list the
# requested language among its available ones.
langs = best_tool.get_available_languages()
if py_language_code not in langs:
    # if we use Tesseract, then this means installing the necessary language
    # files from https://github.com/tesseract-ocr/tessdata into tessdata, which is
    # usually located at /usr/share/tessdata or similar, but there's no API to query
    # the exact location, so we cannot, for now, give a better message.
    evaluation.message('TextRecognize', 'lang', py_language, best_tool.get_name())
    return

import pyocr.builders

# Run the tool on the result of image.pil() using pyocr's TextBuilder.
text = best_tool.image_to_string(
    image.pil(),
    lang=py_language_code,
    builder=pyocr.builders.TextBuilder())

# A list/tuple result is merged into a single newline-separated string.
if isinstance(text, (list, tuple)):
    text = '\n'.join(text)

return String(text)
# NOTE(review): fragment of a function body; the enclosing ``def`` and the
# definitions of cols, k, table_image, table_data, tables, table_no, page
# and bname are outside this excerpt.

# Turn the list of column cut positions into (start, end) pairs of
# consecutive cuts, each shifted by k[0].
cols = [(cols[i] - k[0], cols[i + 1] - k[0]) for i in range(0, len(cols) - 1)]
# Same pairing for the cuts returned by find_cuts() on the table image
# (presumably the row boundaries -- confirm against find_cuts).
y_cuts = find_cuts(table_image, char_scale=self.char_scale)
rows = [(y_cuts[i], y_cuts[i + 1]) for i in range(0, len(y_cuts) - 1)]
table = Table(cols, rows)

# For every cell: crop its rectangle out of the table image, OCR the crop
# and attach the recognized text to the cell.
for i in range(len(table.cells)):
    for j in range(len(table.cells[i])):
        # Cell coordinates are cast to int so they can be used as slice bounds.
        x1 = int(table.cells[i][j].x1)
        y1 = int(table.cells[i][j].y1)
        x2 = int(table.cells[i][j].x2)
        y2 = int(table.cells[i][j].y2)
        # Rows are sliced by y, columns by x.
        table.cells[i][j].image = table_image[y1:y2,x1:x2]
        cell_image = Image.fromarray(table.cells[i][j].image)
        text = self.tool.image_to_string(
            cell_image,
            lang=self.lang,
            builder=pyocr.builders.TextBuilder(tesseract_layout=self.layout)
        )
        table.cells[i][j].add_text(text)

# Export the table as a list, reverse it in place, then pass it through
# encode_list() before storing it under the 'data' key.
ar = table.get_list()
ar.reverse()
ar = encode_list(ar)
table_data['data'] = ar
# Tables are keyed 'table-1', 'table-2', ... (table_no is zero-based here).
# NOTE(review): the counter increment suggests an enclosing per-table loop
# that is not visible in this excerpt -- confirm the nesting of these lines.
tables['table-{0}'.format(table_no + 1)] = table_data
table_no += 1
# Store the collected tables under the base name of bname and return.
page[os.path.basename(bname)] = tables
return page