How to use the pyocr.builders.WordBoxBuilder class in pyocr

To help you get started, we’ve selected a few pyocr examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github openpaperwork / pyocr / tests / tests_base.py View on Github external
def set_builder(self):
        """Set ``self._builder`` to a new ``builders.WordBoxBuilder`` instance."""
        self._builder = builders.WordBoxBuilder()
github DomWeldon / tabular-ocr / eda-scripts / image-detect-columns-v00.py View on Github external
sys.exit()
    print tab(), subheader('loading word boxes from'), highlight(args.from_csv)
    df                              =   pd.read_csv(args.from_csv)
else:
    # yes, does the input file exist?
    if (os.path.exists(inputFile) == False):
        # no, exit
        print tab(), warning('Cannot find input file {0}'.format(inputDir))
        sys.exit()

    # perform the OCR
    print tab(), subheader('Performing OCR'), 'on file ', highlight(inputFile)
    wordBoxes                          =   tool.image_to_string(
        Image.open(os.path.join(inputFile)),
        lang=lang,
        builder=pyocr.builders.WordBoxBuilder()
    )
    print tab(), done()

    # load into pandas
    print tab(), subheader('Loading word boxes'), 'into pandas...'
    df                                  =   pd.DataFrame(columns=['text', 'x0', 'y0', 'x1', 'y1'])
    i                                   =   0
    for box in wordBoxes:
        df.loc[i]                       =   [
            box.content.encode('utf-8'),
            box.position[0][0],
            box.position[0][1],
            box.position[1][0],
            box.position[1][1]
        ]
        i                               +=  1
github openpaperwork / paperwork / src / paperwork / backend / img / page.py View on Github external
"""
        Get all the word boxes of this page.
        """
        boxfile = self.__box_path

        try:
            box_builder = pyocr.builders.LineBoxBuilder()
            with codecs.open(boxfile, 'r', encoding='utf-8') as file_desc:
                boxes = box_builder.read_file(file_desc)
            if boxes != []:
                return boxes
            # fallback: old format: word boxes
            # shouldn't be used anymore ...
            logger.warning("WARNING: Doc %s uses old box format" %
                           (str(self.doc)))
            box_builder = pyocr.builders.WordBoxBuilder()
            with codecs.open(boxfile, 'r', encoding='utf-8') as file_desc:
                boxes = box_builder.read_file(file_desc)
            return boxes
        except IOError, exc:
            logger.error("Unable to get boxes for '%s': %s"
                         % (self.doc.docid, exc))
            return []
github openpaperwork / paperwork / paperwork-backend / paperwork_backend / img / page.py View on Github external
def __get_boxes(self):
        """
        Load the boxes stored in this page's box file.

        The file is first parsed with a LineBoxBuilder. When that yields
        nothing, it is parsed again with a WordBoxBuilder (the old format)
        and the resulting word boxes are wrapped in a single LineBox built
        with the position of the first word box.

        Returns a list of boxes. The list is empty when neither parse
        finds any box, or when an IOError is raised while reading.
        """
        path = self.__box_path

        def parse_with(builder):
            # Open the box file and hand it to the given builder.
            with self.fs.open(path, 'r') as stream:
                return builder.read_file(stream)

        try:
            line_boxes = parse_with(pyocr.builders.LineBoxBuilder())
            if line_boxes != []:
                return line_boxes

            # Nothing came back as line boxes: fall back on the old
            # word-box format, which shouldn't be used anymore.
            word_boxes = parse_with(pyocr.builders.WordBoxBuilder())
            if len(word_boxes) > 0:
                logger.warning("WARNING: Doc %s uses old box format" %
                               (str(self.doc)))
                return [pyocr.builders.LineBox(
                    word_boxes, word_boxes[0].position
                )]
            return []
        except IOError as error:
            logger.error("Unable to get boxes for '%s': %s"
                         % (self.doc.docid, error))
            return []