How to use the tesserocr.get_languages function in tesserocr

To help you get started, we’ve selected a few tesserocr examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github sirfz / tesserocr / tests / test_api.py View on Github external
def test_detect_os(self):
        """Test DetectOS and DetectOrientationScript (tesseract v4+).

        Puts the API into OSD-only page segmentation mode, loads
        ``self._image_file`` and checks that the detection results contain
        every expected key and report an orientation of 0.
        """
        self._api.SetPageSegMode(tesserocr.PSM.OSD_ONLY)
        self._api.SetImageFile(self._image_file)
        orientation = self._api.DetectOS()
        # assertIn() returns None, so wrapping the checks in all() would stop
        # after the first key; a plain loop really verifies every key.
        for key in ('sconfidence', 'oconfidence', 'script', 'orientation'):
            self.assertIn(key, orientation)
        self.assertEqual(orientation['orientation'], 0)
        # The language list is sorted alphabetically, so indexing it with the
        # script id does not give a reliable script name; only the bound of
        # the id is checked here.
        languages = tesserocr.get_languages()[1]
        self.assertLess(orientation['script'], len(languages))
        # DetectOrientationScript is only exercised from this version on.
        if _TESSERACT_VERSION >= 0x3999800:
            orientation = self._api.DetectOrientationScript()
            for key in ('orient_deg', 'orient_conf', 'script_name', 'script_conf'):
                self.assertIn(key, orientation)
            self.assertEqual(orientation['orient_deg'], 0)
            self.assertEqual(orientation['script_name'], 'Latin')
github alephdata / memorious / memorious / services / ocr.py View on Github external
def get_languages(self, languages):
        """Return the '+'-joined language string handed to Tesseract.

        The list reported by tesserocr is fetched on first use and cached on
        ``self.supported_languages``. The ``languages`` argument is not
        consulted at the moment, so the result is always ``'eng'``.
        """
        if not hasattr(self, 'supported_languages'):
            from tesserocr import get_languages
            _path, supported = get_languages()
            self.supported_languages = supported
        selected = {'eng'}
        # NOTE(review): a filter that added every list_to_alpha3() code found
        # in self.supported_languages was disabled (commented out) here.
        ordered = sorted(selected)
        return '+'.join(ordered)
github alephdata / memorious / memorious / services / ocr.py View on Github external
def is_available(cls):
        """Tell whether tesserocr is importable and reports any language.

        Returns False when importing tesserocr fails or when the language
        list it returns is empty, True otherwise.
        """
        try:
            from tesserocr import get_languages
            _path, installed = get_languages()
        except ImportError:
            # tesserocr is not installed (or cannot be loaded).
            return False
        return len(installed) > 0
github OCR-D / ocrd_tesserocr / ocrd_tesserocr / config.py View on Github external
import os
import tesserocr
# Prefer the TESSDATA_PREFIX environment variable; only when it is not set,
# fall back to the first element returned by tesserocr.get_languages().
try:
    TESSDATA_PREFIX = os.environ['TESSDATA_PREFIX']
except KeyError:
    TESSDATA_PREFIX = tesserocr.get_languages()[0]
github alephdata / aleph / services / ingest-file / ingestors / support / ocr.py View on Github external
def language_list(self, languages):
        """Build the '+'-joined model string for the given languages.

        The supported model list is read from tesserocr once and cached on
        ``settings.ocr_supported``. Requested languages are converted with
        ``alpha3`` and kept only if supported; at most ``self.MAX_MODELS`` of
        them are retained before ``'eng'`` is added. The result is
        de-duplicated and sorted.
        """
        if not hasattr(settings, 'ocr_supported'):
            # The lookup runs under TESSERACT_LOCALE.
            with temp_locale(TESSERACT_LOCALE):
                from tesserocr import get_languages
                _path, supported = get_languages()
                settings.ocr_supported = supported
        models = []
        for code in alpha3(languages):
            if code in settings.ocr_supported:
                models.append(code)
        if len(models) > self.MAX_MODELS:
            log.warning("Too many models, limit: %s", self.MAX_MODELS)
            models = models[:self.MAX_MODELS]
        # 'eng' is added after the limit has been applied.
        unique = set(models)
        unique.add('eng')
        return '+'.join(sorted(unique))
github alephdata / aleph / services / recognize-text / textrecognizer / recognize.py View on Github external
def __init__(self):
        """Record the supported Tesseract languages and set up thread-local state."""
        # get_languages() yields a pair; only its second element is kept.
        _path, supported = get_languages()
        self.supported = supported
        self.tl = threading.local()
github occrp-attic / ingestors / ingestors / services / ocr.py View on Github external
def get_languages(self, languages):
        """Build the '+'-joined Tesseract language string for a request.

        The list reported by tesserocr is fetched on first use and cached on
        ``self.supported_languages``.

        Args:
            languages: language codes requested by the caller; they are
                converted with ``list_to_alpha3`` before being filtered.

        Returns:
            The requested codes that are supported, plus ``'eng'`` (always
            included), sorted and joined with ``'+'``.
        """
        if not hasattr(self, 'supported_languages'):
            from tesserocr import get_languages
            _, self.supported_languages = get_languages()
        codes = {'eng'}
        # Filter the caller's languages. Iterating the local `codes` set, as
        # before, ignored the argument and always produced just 'eng'.
        codes.update(
            lang for lang in list_to_alpha3(languages)
            if lang in self.supported_languages
        )
        return '+'.join(sorted(codes))
github OCR-D / ocrd_tesserocr / ocrd_tesserocr / recognize.py View on Github external
def get_languages(*args, **kwargs):
    """
    Call the wrapped tesserocr.get_languages() with ``path`` pinned to
    TESSDATA_PREFIX; all other arguments are forwarded unchanged.
    """
    pinned = {'path': TESSDATA_PREFIX}
    return get_languages_(*args, **pinned, **kwargs)