How to use pytesseract - 10 common examples

To help you get started, we’ve selected a few pytesseract examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github madmaze / pytesseract / tests / test_pytesseract.py View on Github external
def test_image_to_data_common_output(test_file, output):
    """Test and compare the type of the result.

    ``output`` is forwarded to ``image_to_data`` as ``output_type``. The
    result must have the matching Python type and, for the dict and string
    forms, contain every entry of ``expected_keys``.
    """
    result = image_to_data(test_file, output_type=output)
    expected_keys = [
        'level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num',
        'left', 'top', 'width', 'height', 'conf', 'text'
    ]

    if output is Output.BYTES:
        assert isinstance(result, bytes)

    elif output is Output.DICT:
        assert isinstance(result, dict)
        # Require every key, mirroring the STRING branch below; a merely
        # non-empty intersection would pass with a single matching key.
        missing = set(expected_keys).difference(result)
        assert not missing, 'missing keys: %s' % sorted(missing)

    elif output is Output.STRING:
        assert isinstance(result, unicode if IS_PYTHON_2 else str)
        for key in expected_keys:
            assert key in result
github madmaze / pytesseract / tests / test_pytesseract.py View on Github external
def test_wrong_tesseract_cmd(test_file, test_path):
    """Test wrong or missing tesseract command.

    Sets ``pytesseract.pytesseract.tesseract_cmd`` to ``test_path`` and
    expects ``image_to_string`` to raise ``TesseractNotFoundError``.
    """
    import pytesseract
    previous_cmd = pytesseract.pytesseract.tesseract_cmd
    pytesseract.pytesseract.tesseract_cmd = test_path
    try:
        with pytest.raises(TesseractNotFoundError):
            pytesseract.pytesseract.image_to_string(test_file)
    finally:
        # Restore even when the assertion fails, so the overridden command
        # does not stay set on the module for whatever runs afterwards.
        pytesseract.pytesseract.tesseract_cmd = previous_cmd
github madmaze / pytesseract / tests / test_pytesseract.py View on Github external
def test_wrong_tesseract_cmd(test_file, test_path):
    """Test wrong or missing tesseract command.

    Sets ``pytesseract.pytesseract.tesseract_cmd`` to ``test_path`` and
    expects ``image_to_string`` to raise ``TesseractNotFoundError``.
    """
    import pytesseract
    previous_cmd = pytesseract.pytesseract.tesseract_cmd
    pytesseract.pytesseract.tesseract_cmd = test_path
    try:
        with pytest.raises(TesseractNotFoundError):
            pytesseract.pytesseract.image_to_string(test_file)
    finally:
        # Restore even when the assertion fails, so the overridden command
        # does not stay set on the module for whatever runs afterwards.
        pytesseract.pytesseract.tesseract_cmd = previous_cmd
github madmaze / pytesseract / tests / test_pytesseract.py View on Github external
def test_wrong_tesseract_cmd(test_file, test_path):
    """Test wrong or missing tesseract command.

    Sets ``pytesseract.pytesseract.tesseract_cmd`` to ``test_path`` and
    expects ``image_to_string`` to raise ``TesseractNotFoundError``.
    """
    import pytesseract
    previous_cmd = pytesseract.pytesseract.tesseract_cmd
    pytesseract.pytesseract.tesseract_cmd = test_path
    try:
        with pytest.raises(TesseractNotFoundError):
            pytesseract.pytesseract.image_to_string(test_file)
    finally:
        # Restore even when the assertion fails, so the overridden command
        # does not stay set on the module for whatever runs afterwards.
        pytesseract.pytesseract.tesseract_cmd = previous_cmd
github madmaze / pytesseract / tests / test_pytesseract.py View on Github external
def test_proper_oserror_exception_handling(test_file, test_path):
    """Test for bubbling up OSError exceptions.

    With ``tesseract_cmd`` set to ``test_path``, ``image_to_string`` must
    raise ``TesseractNotFoundError`` when running on Python 2 with a truthy
    ``test_path``, and ``OSError`` in every other case.
    """
    import pytesseract
    previous_cmd = pytesseract.pytesseract.tesseract_cmd
    pytesseract.pytesseract.tesseract_cmd = test_path
    try:
        with pytest.raises(
            TesseractNotFoundError if IS_PYTHON_2 and test_path else OSError
        ):
            pytesseract.pytesseract.image_to_string(test_file)
    finally:
        # Restore even when the assertion fails, so the overridden command
        # does not stay set on the module for whatever runs afterwards.
        pytesseract.pytesseract.tesseract_cmd = previous_cmd
github madmaze / pytesseract / tests / test_pytesseract.py View on Github external
if numpy_installed:
    import numpy as np

if pandas_installed:
    import pandas

try:
    from PIL import Image
except ImportError:
    import Image


# True when the major component of ``version_info`` is below 3
# (presumably ``sys.version_info`` -- its import is not visible here; confirm).
IS_PYTHON_2 = version_info[:1] < (3, )
IS_PYTHON_3 = not IS_PYTHON_2

# Tuple built from ``get_tesseract_version().version``; evaluated at import.
TESSERACT_VERSION = tuple(get_tesseract_version().version)  # to skip tests

# Test assets are looked up in the ``data`` directory next to this file.
DATA_DIR = path.join(path.dirname(path.abspath(__file__)), 'data')
TEST_JPEG = path.join(DATA_DIR, 'test.jpg')

pytestmark = pytest.mark.pytesseract  # used marker for the module


@pytest.fixture(scope='session')
def test_file():
    """Session-scoped fixture returning the ``TEST_JPEG`` path."""
    jpeg_path = TEST_JPEG
    return jpeg_path


@pytest.fixture(scope='session')
def test_file_european():
    """Session-scoped fixture returning the ``test-european.jpg`` path."""
    european_jpeg = path.join(DATA_DIR, 'test-european.jpg')
    return european_jpeg
github madmaze / pytesseract / tests / test_pytesseract.py View on Github external
def test_image_to_pdf_or_hocr(test_file, extension):
    """Check the result type and framing markers of PDF and hOCR output.

    ``extension`` is forwarded to ``image_to_pdf_or_hocr``. For ``'pdf'`` the
    result is a ``str`` on Python 2 (checked for the ``%PDF`` prefix and
    ``EOF`` suffix) and ``bytes`` otherwise. For ``'hocr'`` the result is
    ``bytes`` that decode as UTF-8 to a document starting with ``<?xml``.
    """
    result = image_to_pdf_or_hocr(test_file, extension=extension)

    # Compare by value: ``is`` against a string literal only works when the
    # two strings happen to be the same interned object.
    if extension == 'pdf':
        if IS_PYTHON_2:
            assert isinstance(result, str)
            result = str(result).strip()
            assert result.startswith('%PDF')
            assert result.endswith('EOF')
        else:
            assert isinstance(result, bytes)

    if extension == 'hocr':
        assert isinstance(result, bytes)  # type
        result = result.decode('utf-8') if IS_PYTHON_2 else str(result, 'utf-8')
        result = str(result).strip()
        # ``startswith('')`` is true for any string; check the XML prolog
        # instead so the assertion can actually fail.
        assert result.startswith('<?xml')
github madmaze / pytesseract / tests / test_pytesseract.py View on Github external
        Output.DICT,
        Output.STRING,
    ],
    ids=[
        'bytes',
        'dict',
        'string',
    ]
)
def test_image_to_data_common_output(test_file, output):
    """Test and compare the type of the result."""
    result = image_to_data(test_file, output_type=output)
    expected_keys = [
        'level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num',
        'left', 'top', 'width', 'height', 'conf', 'text'
    ]
github madmaze / pytesseract / tests / test_pytesseract.py View on Github external
def test_image_to_data__pandas_support(test_file):
    """Expect ``TSVNotSupported`` when requesting ``Output.DATAFRAME``."""
    call_kwargs = {'output_type': Output.DATAFRAME}
    with pytest.raises(TSVNotSupported):
        image_to_data(test_file, **call_kwargs)
github madmaze / pytesseract / tests / test_pytesseract.py View on Github external
def test_image_to_data_common_output(test_file, output):
    """Test and compare the type of the result.

    ``output`` is forwarded to ``image_to_data`` as ``output_type``. The
    result must have the matching Python type and, for the dict and string
    forms, contain every entry of ``expected_keys``.
    """
    result = image_to_data(test_file, output_type=output)
    expected_keys = [
        'level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num',
        'left', 'top', 'width', 'height', 'conf', 'text'
    ]

    if output is Output.BYTES:
        assert isinstance(result, bytes)

    elif output is Output.DICT:
        assert isinstance(result, dict)
        # Require every key, mirroring the STRING branch below; a merely
        # non-empty intersection would pass with a single matching key.
        missing = set(expected_keys).difference(result)
        assert not missing, 'missing keys: %s' % sorted(missing)

    elif output is Output.STRING:
        assert isinstance(result, unicode if IS_PYTHON_2 else str)
        for key in expected_keys:
            assert key in result