Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def set_builder(self):
    """Store a newly created ``builders.WordBoxBuilder`` in ``self._builder``."""
    word_box_builder = builders.WordBoxBuilder()
    self._builder = word_box_builder
sys.exit()
print tab(), subheader('loading word boxes from'), highlight(args.from_csv)
df = pd.read_csv(args.from_csv)
else:
# yes, does the input file exist?
if (os.path.exists(inputFile) == False):
# no, exit
print tab(), warning('Cannot find input file {0}'.format(inputDir))
sys.exit()
# perform the OCR
print tab(), subheader('Performing OCR'), 'on file ', highlight(inputFile)
wordBoxes = tool.image_to_string(
Image.open(os.path.join(inputFile)),
lang=lang,
builder=pyocr.builders.WordBoxBuilder()
)
print tab(), done()
# load into pandas
print tab(), subheader('Loading word boxes'), 'into pandas...'
df = pd.DataFrame(columns=['text', 'x0', 'y0', 'x1', 'y1'])
i = 0
for box in wordBoxes:
df.loc[i] = [
box.content.encode('utf-8'),
box.position[0][0],
box.position[0][1],
box.position[1][0],
box.position[1][1]
]
i += 1
"""
Get all the word boxes of this page.
"""
boxfile = self.__box_path
try:
box_builder = pyocr.builders.LineBoxBuilder()
with codecs.open(boxfile, 'r', encoding='utf-8') as file_desc:
boxes = box_builder.read_file(file_desc)
if boxes != []:
return boxes
# fallback: old format: word boxes
# shouldn't be used anymore ...
logger.warning("WARNING: Doc %s uses old box format" %
(str(self.doc)))
box_builder = pyocr.builders.WordBoxBuilder()
with codecs.open(boxfile, 'r', encoding='utf-8') as file_desc:
boxes = box_builder.read_file(file_desc)
return boxes
except IOError, exc:
logger.error("Unable to get boxes for '%s': %s"
% (self.doc.docid, exc))
return []
def __get_boxes(self):
    """
    Return the boxes stored in this page's box file.

    The file at ``self.__box_path`` is opened through ``self.fs`` and
    parsed with a ``LineBoxBuilder``. A non-empty result is returned as is.

    Otherwise the file is read a second time with a ``WordBoxBuilder``
    (the old format). If that yields boxes, a warning is logged and a
    one-element list is returned: a single ``LineBox`` wrapping all of
    them and carrying the position of the first one.

    An empty list is returned when neither parser finds anything, or when
    an ``IOError`` occurs (the error is logged).
    """
    box_path = self.__box_path
    try:
        # Current format: the file describes line boxes.
        line_reader = pyocr.builders.LineBoxBuilder()
        with self.fs.open(box_path, 'r') as box_fd:
            line_boxes = line_reader.read_file(box_fd)
        if line_boxes != []:
            return line_boxes

        # Legacy format: the file describes bare word boxes.
        # Should not be in use anymore.
        word_reader = pyocr.builders.WordBoxBuilder()
        with self.fs.open(box_path, 'r') as box_fd:
            word_boxes = word_reader.read_file(box_fd)
        if len(word_boxes) < 1:
            return []

        logger.warning("WARNING: Doc %s uses old box format" %
                       (str(self.doc)))
        first_position = word_boxes[0].position
        legacy_line = pyocr.builders.LineBox(word_boxes, first_position)
        return [legacy_line]
    except IOError as exc:
        logger.error("Unable to get boxes for '%s': %s"
                     % (self.doc.docid, exc))
        return []