How to use the langcodes.get function in langcodes

To help you get started, we've selected a few examples of langcodes.get, drawn from popular ways it is used in public projects.
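Before the project examples, here is a minimal sketch of what langcodes.get does: it parses a BCP 47 language tag into a Language object whose parts you can inspect and describe (the output values shown in the comments are approximate):

import langcodes

lang = langcodes.get('zh-Hant-TW')
print(str(lang))                 # normalized tag, e.g. 'zh-Hant-TW'
print(lang.language)             # 'zh'
print(lang.language_name('en'))  # 'Chinese'
print(lang.describe())           # e.g. {'language': 'Chinese', 'script': 'Traditional Han', 'territory': 'Taiwan'}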


github LuminosoInsight / wordfreq / wordfreq / __init__.py
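(Excerpt: this snippet starts mid-function. It assumes langcodes is imported and that lang, wordlist, available, logger, and read_cBpack are defined elsewhere in the module.)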
    # TODO: decrease the maximum distance. This distance is so high just
    # because it allows a test where 'yue' matches 'zh', and maybe the
    # distance between those is high because they shouldn't match.
    best, _distance = langcodes.closest_match(
        lang, list(available), max_distance=70
    )
    if best == 'und':
        raise LookupError("No wordlist %r available for language %r"
                          % (wordlist, lang))

    if best != lang:
        logger.warning(
            "You asked for word frequencies in language %r. Using the "
            "nearest match, which is %r (%s)."
            % (lang, best, langcodes.get(best).language_name('en'))
        )

    return read_cBpack(available[best])
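The lookup above leans on langcodes.closest_match, which returns the best match among the supported codes plus a distance score, or 'und' when nothing is close enough. A minimal sketch, reusing the permissive cutoff from the snippet:

import langcodes

best, distance = langcodes.closest_match('yue', ['zh', 'en', 'ja'], max_distance=70)
print(best, distance)  # at max_distance=70, 'yue' can match 'zh'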
github toolforge / video2commons / video2commons / backend / subtitles / __init__.py
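(Excerpt: the def line naming this function is cut off above, so the snippet begins with the tail of its parameter list. Converter is imported elsewhere in the module.)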
    subtitles, wikifilename, username,
    statuscallback=None, errorcallback=None
):
    """Convert and upload subtitles to corresponding TimedText pages."""
    statuscallback = statuscallback or (lambda text, percent: None)
    errorcallback = errorcallback or (lambda text: None)

    percent = 0
    c = Converter(
        ffmpeg_path='/usr/bin/ffmpeg',
        ffprobe_path='/usr/bin/ffprobe'
    )

    for langcode, filename in subtitles.items():
        try:
            lang = langcodes.get(langcode)
            langcode = str(lang).lower()

            langdesc = lang.describe()
            langname = langdesc['language']
            del langdesc['language']
            if langdesc:
                langname += u' (%s)' % ', '.join(langdesc.values())

            statuscallback(u'Loading subtitle in ' + langname, int(percent))
            subtitletext = ''

            info = c.probe(filename)
            if not info:
                continue
            if len(info.streams) != 1:
                continue
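The describe() pattern above builds a human-readable label from a tag's parts. A standalone sketch of the same idea:

import langcodes

lang = langcodes.get('pt-BR')
desc = lang.describe()   # e.g. {'language': 'Portuguese', 'territory': 'Brazil'}
name = desc.pop('language')
if desc:
    name += ' (%s)' % ', '.join(desc.values())
print(name)              # 'Portuguese (Brazil)'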
github LuminosoInsight / wordfreq / wordfreq / tokens.py
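(Excerpt: this snippet starts partway through the docstring of wordfreq's tokenize function; the def line and the docstring's opening lines are cut off above.)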
    The `external_wordlist` option only affects Chinese tokenization.  If it's
    True, then wordfreq will not use its own Chinese wordlist for tokenization.
    Instead, it will use the large wordlist packaged with the Jieba tokenizer,
    and it will leave Traditional Chinese characters as is. This will probably
    give more accurate tokenization, but the resulting tokens won't necessarily
    have word frequencies that can be looked up.

    If you end up seeing tokens that are entire phrases or sentences glued
    together, that probably means you passed in CJK text with the wrong
    language code.
    """
    # Use globals to load CJK tokenizers on demand, so that we can still run
    # in environments that lack the CJK dependencies
    global _mecab_tokenize, _jieba_tokenize

    language = langcodes.get(lang)
    info = get_language_info(language)
    text = preprocess_text(text, language)

    if info['tokenizer'] == 'mecab':
        from wordfreq.mecab import mecab_tokenize as _mecab_tokenize

        # Get just the language code out of the Language object, so we can
        # use it to select a MeCab dictionary
        tokens = _mecab_tokenize(text, language.language)
        if not include_punctuation:
            tokens = [token for token in tokens if not PUNCT_RE.match(token)]
    elif info['tokenizer'] == 'jieba':
        from wordfreq.chinese import jieba_tokenize as _jieba_tokenize

        tokens = _jieba_tokenize(text, external_wordlist=external_wordlist)
        if not include_punctuation:
            tokens = [token for token in tokens if not PUNCT_RE.match(token)]
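The snippet is truncated here. The function it comes from is exposed as wordfreq.tokenize; a hedged usage sketch, assuming the wordfreq package and its CJK extras (mecab, jieba) are installed:

import wordfreq

# external_wordlist=True makes the Chinese branch use Jieba's bundled
# dictionary instead of wordfreq's own wordlist, as the docstring above explains.
print(wordfreq.tokenize('谢谢你', 'zh'))
print(wordfreq.tokenize('谢谢你', 'zh', external_wordlist=True))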