# TODO: decrease the maximum distance. This distance is so high just
# because it allows a test where 'yue' matches 'zh', and maybe the
# distance between those is high because they shouldn't match.
best, _distance = langcodes.closest_match(
    lang, list(available), max_distance=70
)
if best == 'und':
    raise LookupError("No wordlist %r available for language %r"
                      % (wordlist, lang))

if best != lang:
    logger.warning(
        "You asked for word frequencies in language %r. Using the "
        "nearest match, which is %r (%s)."
        % (lang, best, langcodes.get(best).language_name('en'))
    )

return read_cBpack(available[best])
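
# Usage sketch (not part of the original snippet): how langcodes.closest_match
# behaves on its own, to show why such a generous max_distance is needed.
# Exact distance values depend on the installed langcodes version and its
# CLDR matching data, so nothing below is guaranteed output.
import langcodes

# With the generous cutoff, Cantonese ('yue') is allowed to fall back to
# Chinese ('zh'); with a strict cutoff the same call is expected to return
# ('und', 1000), meaning "no acceptable match within the cutoff".
print(langcodes.closest_match('yue', ['zh', 'en'], max_distance=70))
print(langcodes.closest_match('yue', ['zh', 'en'], max_distance=10))

# language_name() is what the warning above uses to render the match in English.
print(langcodes.get('zh').language_name('en'))  # 'Chinese'
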
def upload_subtitles(   # hypothetical name: the opening `def` line is cut off in this excerpt
    subtitles, wikifilename, username,
    statuscallback=None, errorcallback=None
):
    """Convert and upload subtitles to corresponding TimedText pages."""
    statuscallback = statuscallback or (lambda text, percent: None)
    errorcallback = errorcallback or (lambda text: None)

    percent = 0
    c = Converter(
        ffmpeg_path='/usr/bin/ffmpeg',
        ffprobe_path='/usr/bin/ffprobe'
    )

    for langcode, filename in subtitles.items():
        try:
            # Normalize the language code and build a human-readable name
            # from everything langcodes can describe about it.
            lang = langcodes.get(langcode)
            langcode = str(lang).lower()
            langdesc = lang.describe()
            langname = langdesc['language']
            del langdesc['language']
            if langdesc:
                langname += u' (%s)' % ', '.join(langdesc.values())

            statuscallback(u'Loading subtitle in ' + langname, int(percent))
            subtitletext = ''
            # Probe the subtitle file with ffprobe; skip it if it cannot be
            # probed or does not contain exactly one stream.
            info = c.probe(filename)
            if not info:
                continue
            if len(info.streams) != 1:
                continue
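
# Usage sketch (not part of the original snippet): the display-name logic from
# the loop above, shown on its own. langcodes' Language.describe() returns a
# dict keyed by subtag type ('language', 'script', 'territory', ...); the exact
# wording of the values depends on the installed langcodes/CLDR data. The
# helper name below is ours, for illustration only.
import langcodes

def describe_langcode(code):
    lang = langcodes.get(code)
    desc = lang.describe()
    name = desc.pop('language')          # e.g. 'Chinese'
    if desc:                             # anything left over: script, territory, ...
        name += ' (%s)' % ', '.join(desc.values())
    return str(lang).lower(), name

print(describe_langcode('zh-Hant'))
print(describe_langcode('en'))
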
The `external_wordlist` option only affects Chinese tokenization. If it's
True, then wordfreq will not use its own Chinese wordlist for tokenization.
Instead, it will use the large wordlist packaged with the Jieba tokenizer,
and it will leave Traditional Chinese characters as is. This will probably
give more accurate tokenization, but the resulting tokens won't necessarily
have word frequencies that can be looked up.
If you end up seeing tokens that are entire phrases or sentences glued
together, that probably means you passed in CJK text with the wrong
language code.
"""
# Use globals to load CJK tokenizers on demand, so that we can still run
# in environments that lack the CJK dependencies
global _mecab_tokenize, _jieba_tokenize

language = langcodes.get(lang)
info = get_language_info(language)
text = preprocess_text(text, language)

if info['tokenizer'] == 'mecab':
    from wordfreq.mecab import mecab_tokenize as _mecab_tokenize
    # Get just the language code out of the Language object, so we can
    # use it to select a MeCab dictionary
    tokens = _mecab_tokenize(text, language.language)
    if not include_punctuation:
        tokens = [token for token in tokens if not PUNCT_RE.match(token)]
elif info['tokenizer'] == 'jieba':
    from wordfreq.chinese import jieba_tokenize as _jieba_tokenize
    tokens = _jieba_tokenize(text, external_wordlist=external_wordlist)
    if not include_punctuation:
        # The excerpt is cut off here; punctuation is filtered the same way
        # as in the MeCab branch above.
        tokens = [token for token in tokens if not PUNCT_RE.match(token)]
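
# Usage sketch (not part of the original snippet): calling wordfreq's public
# tokenize() with and without external_wordlist. This assumes wordfreq is
# installed with its CJK extras (which pull in jieba); the exact tokens
# returned depend on the wordlist versions, so none are asserted here.
from wordfreq import tokenize

text = '我喜欢自然语言处理'
print(tokenize(text, 'zh'))                          # wordfreq's own Chinese wordlist
print(tokenize(text, 'zh', external_wordlist=True))  # Jieba's larger wordlist;
                                                     # Traditional characters left as is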