How to use the pyocr.builders.TextBuilder class in pyocr

To help you get started, we’ve selected a few pyocr examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github openpaperwork / pyocr / tests / tests_tesseract.py View on Github external
def set_builder(self):
        """Store a fresh ``builders.TextBuilder`` instance on ``self._builder``."""
        self._builder = builders.TextBuilder()
github elliotkendall / exraidbot / pokeocr.py View on Github external
# Scale up 3x in both directions (fx=3, fy=3) with bicubic interpolation,
# which oddly helps with OCR.
    # NOTE(review): height and width are not used in the visible part of
    # this function.
    height, width = image.shape[:2]
    image = cv2.resize(image, (0,0), fx=3, fy=3, interpolation=cv2.INTER_CUBIC)

    # Increase contrast. Must be done before grayscale conversion
    image = cv2utils.increaseContrast(image)

    # Convert to grayscale
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Convert to PIL format
    pil = Image.fromarray(image)

    # OCR the text as plain text (TextBuilder) in the configured language
    txt = self.tool.image_to_string(pil, lang=self.lang, builder=pyocr.builders.TextBuilder())

    # Apply NFKD (compatibility decomposition) normalization so that
    # compatibility characters are replaced by their closest plain
    # equivalents.  Note that this only decomposes characters; it does not
    # by itself remove non-ASCII code points (e.g. combining marks remain).
    # This is bad news for i18n, but helps us with a lot of OCR issues.
    txt = unicodedata.normalize('NFKD', txt)

    lines = txt.split("\n")

    # Sometimes OCR will insert extra empty lines, so let's strip them out:
    # keep only the lines that contain something other than whitespace.
    newlines = []
    for i in range(len(lines)):
      if not len(lines[i].strip()) == 0:
        newlines.append(lines[i])
    lines = newlines

    if debug:
github aryaminus / memento / mementor / ocr_save.py View on Github external
# OCR every file in `path` whose extension is listed in VALIDITY and save
# the recognized text; everything else is only counted in other_files.
for f in os.listdir(path): # Iterate over the entries of the path directory

            ext = os.path.splitext(f)[1] # Extension part of the name, e.g. .png / .jpg

            if ext.lower() not in VALIDITY: # Case-insensitive check against the validity list
                other_files += 1 # Count entries whose extension is not in VALIDITY
                continue

            else:

                count += 1
                image_file_name = path + '/' + f # Full /dir/path/filename.extension

                # NOTE(review): the image returned by Im.open is not
                # explicitly closed anywhere in the visible code.
                txt = tool.image_to_string(
                    Im.open(image_file_name), lang=self.lang,
                    builder=pyocr.builders.TextBuilder()
                )

                # Build a one-line label from the OCR text: \a, \b, \f, \n,
                # \t and \v become spaces, \r is removed.
                initial = txt.replace('\a', ' ').replace('\b', ' ').replace('\f', ' ').replace('\n',' ').replace('\r', '').replace('\t',' ').replace('\v',' ')
                initial = initial[:60] # Keep only the first 60 characters
                print('Filename:' + initial + '\n')

                # NOTE(review): sets mode 0o777 (read/write/execute for
                # everyone) on the directory on every iteration -- confirm
                # this is intended.
                os.chmod(path, 0o777)
                # Hand the label and the full OCR text to self.savefile.
                self.savefile(initial, txt, directory_path)

                print(str(count) + (" file" if count == 1 else " files") + " processed")

        # Final summary: either nothing was counted at all, or report how
        # many of the counted files (count + other_files) were converted.
        if count + other_files == 0:
            print("No files found")
        else :
            print(str(count) + " / " + str(count + other_files) + " files converted")
github mayan-edms / Mayan-EDMS / mayan / apps / ocr / backends / pyocr.py View on Github external
def execute(self, *args, **kwargs):
        """
        Run OCR on the current page image through the configured pyocr tool.

        Calls the parent ``execute`` with the same arguments, opens the
        image returned by ``self.converter.get_page()`` and passes it to
        ``self.tool.image_to_string`` with ``self.language`` and a
        ``TextBuilder``, inside the ``c_locale()`` context manager.

        Raises:
            OCRError: if the pyocr call raises any exception. The message
                contains the language and the original exception, plus an
                installation hint when ``self.language`` is not in
                ``self.languages``. The message is also logged as an error.
        """
        super(PyOCR, self).execute(*args, **kwargs)

        image = Image.open(self.converter.get_page())
        try:
            with c_locale():
                # NOTE(review): result is not used in the visible portion of
                # this method -- the excerpt may be truncated.
                result = self.tool.image_to_string(
                    image,
                    lang=self.language,
                    builder=pyocr.builders.TextBuilder()
                )
        except Exception as exception:
            error_message = (
                'Exception calling pyocr with language option: {}; {}'
            ).format(self.language, exception)

            # Extend the message with a hint when the requested language is
            # not among the known ones.
            if self.language not in self.languages:
                error_message = (
                    '{}\nThe requested OCR language "{}" is not '
                    'available and needs to be installed.\n'
                ).format(
                    error_message, self.language
                )

            logger.error(error_message)
            raise OCRError(error_message)
github CarlFK / veyepar / dj / scripts / addimg.py View on Github external
def ocr_img(self, imgname):

        """
        Run OCR on an image file, print the recognized text and return it.

        The image is opened with ``Image.open`` and handed to the first tool
        reported by ``pyocr.get_available_tools()``, using the ``eng``
        language and a ``TextBuilder``.

        To use a non-standard language pack named foo.traineddata, set the
        TESSDATA_PREFIX environment variable so the file can be found at
        TESSDATA_PREFIX/tessdata/foo.traineddata and give Tesseract the
        argument -l foo.

        Args:
            imgname: whatever ``Image.open`` accepts for the image to read.

        Returns:
            The text produced by ``image_to_string``.

        Raises:
            IndexError: if ``pyocr.get_available_tools()`` returns an empty
                list.
        """

        tools = pyocr.get_available_tools()
        tool = tools[0]
        # Open the image in a ``with`` block so the underlying file handle
        # is released as soon as OCR finishes, even if the OCR call raises.
        with Image.open(imgname) as image:
            text = tool.image_to_string(
                image,
                lang='eng',
                builder=pyocr.builders.TextBuilder(),
            )

        print(text)

        return text
github cseas / ocr-table / py_ocr.py View on Github external
# alternate approach using pyocr
from PIL import Image
import sys

import pyocr
import pyocr.builders

import codecs

# Use the first OCR tool pyocr can find and a plain-text builder.
tool = pyocr.get_available_tools()[0]
builder = pyocr.builders.TextBuilder()

# Open the image in a ``with`` block so its file handle is closed once OCR
# is done; a multi-frame TIFF would otherwise keep the file open.
with Image.open('file.tiff') as image:
    txt = tool.image_to_string(
        image,
        lang="eng",
        builder=builder
    )
# txt is a Python string

with codecs.open("toto.txt", 'w', encoding='utf-8') as file_descriptor:
    builder.write_file(file_descriptor, txt)
# toto.txt is a simple text file, encoded in utf-8
github aryaminus / memento / mementor / image_ocr.py View on Github external
def main(self, text_img_name):
        """
        Run OCR on one image and return the recognized text.

        The image is opened with ``Im.open`` and passed to the module-level
        ``tool`` together with ``self.lang`` and a ``TextBuilder``.

        Args:
            text_img_name: whatever ``Im.open`` accepts for the image to read.

        Returns:
            The text produced by ``tool.image_to_string``.
        """
        # Use a ``with`` block so the image's file handle is closed once OCR
        # has finished, even if the OCR call raises.
        # NOTE(review): assumes Im is PIL's Image module -- confirm against
        # the file's imports.
        with Im.open(text_img_name) as image:
            txt = tool.image_to_string(
                image, lang=self.lang,
                builder=pyocr.builders.TextBuilder()
            )

        return txt
github ecthros / labelReader / utils / tesseract_ocr.py View on Github external
def ocr(self, images):
		'''Run OCR over every (area, image) entry and collect the text.

		Input: images -- iterable of (area, image) entries; the image part
		is passed to the OCR tool using the first language in self.langs.
		Returns a list of (area, text) tuples in input order. Each result
		is also printed to stdout.'''

		recognized = []
		for entry in images:
			text = self.tool.image_to_string(
				entry[1],
				lang=self.langs[0],
				builder=pyocr.builders.TextBuilder(),
			)
			area = entry[0]
			print("==RESULT==" + str(area) + "\n" + text + "\n==========================")
			recognized.append((area, text))
		return recognized