# Imports needed to run these wordfreq tests. (The module path for
# preprocess_text is an assumption; in recent wordfreq versions it lives
# in wordfreq.preprocess -- adjust for your version if needed.)
import pytest
from wordfreq import tokenize, word_frequency
from wordfreq.preprocess import preprocess_text

def test_minimums():
    assert word_frequency('esquivalience', 'en') == 0
    assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6
    assert word_frequency('the', 'en', minimum=1) == 1
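
# A minimal sketch of the behavior test_minimums checks, assuming `minimum`
# simply acts as a floor on the returned frequency. This is a hypothetical
# helper for illustration, not wordfreq's implementation.
def clamp_frequency(raw_freq, minimum=0.0):
    # Unknown words report 0.0, which the floor lifts to `minimum`.
    return max(raw_freq, minimum)

assert clamp_frequency(0.0, minimum=1e-6) == 1e-6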

def test_combination_korean():
    gamsa_freq = word_frequency('감사', 'ko')      # "thanks"
    habnida_freq = word_frequency('합니다', 'ko')  # "(we) do"
    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2, rel=0.01)
    assert (
        1.0 / word_frequency('감사합니다', 'ko') ==
        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01)
    )
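
# The assertions above encode the combination rule the Korean test checks
# directly: a phrase's frequency is the reciprocal of the summed reciprocals
# of its token frequencies, so two equal tokens combine to half the
# single-token frequency. A standalone sketch of that arithmetic:
def harmonic_combination(freqs):
    # 1 / (1/f1 + ... + 1/fn)
    return 1.0 / sum(1.0 / f for f in freqs)

assert harmonic_combination([0.01, 0.01]) == pytest.approx(0.005)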

def test_language_matching():
    freq = word_frequency('的', 'zh')
    assert word_frequency('的', 'zh-TW') == freq
    assert word_frequency('的', 'zh-CN') == freq
    assert word_frequency('的', 'zh-Hant') == freq
    assert word_frequency('的', 'zh-Hans') == freq
    assert word_frequency('的', 'yue-HK') == freq
    assert word_frequency('的', 'cmn') == freq

def test_combination_chinese():
    xiexie_freq = word_frequency('谢谢', 'zh')  # "Thanks"
    assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)
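
# Why /20 rather than /2? A hedged reading: the two equal tokens combine
# harmonically to xiexie_freq / 2, and wordfreq additionally penalizes word
# breaks it has to infer in spaceless text. The factor-of-10 penalty below is
# an assumption about wordfreq's internals, and this helper is a hypothetical
# reconstruction of the arithmetic, not wordfreq's own code.
def combined_with_penalty(freq, n_tokens, inferred_breaks, penalty=10):
    # Harmonic combination of n equal frequencies, then the break penalty.
    return (freq / n_tokens) / (penalty ** inferred_breaks)

assert combined_with_penalty(0.01, 2, 1) == pytest.approx(0.01 / 20)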

def test_arabic():
    # Remove tatweels
    assert tokenize('متــــــــعب', 'ar') == ['متعب']

    # Remove combining marks
    assert tokenize('حَرَكَات', 'ar') == ['حركات']

    # An Arabic ligature that is affected by NFKC normalization
    assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
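
# The ligature assertion relies on standard Unicode NFKC normalization, which
# decomposes U+FEFB (ARABIC LIGATURE LAM WITH ALEF ISOLATED FORM) into
# LAM + ALEF. The same check, standalone, using only the standard library:
import unicodedata

assert unicodedata.normalize('NFKC', '\ufefb') == '\u0644\u0627'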

def test_alternate_codes():
    # Tokenization of Chinese works when you use other language codes
    # that are not equal to 'zh'.
    tokens = ['谢谢', '谢谢']

    # Code with a region attached
    assert tokenize('谢谢谢谢', 'zh-CN') == tokens

    # Over-long codes for Chinese
    assert tokenize('谢谢谢谢', 'chi') == tokens
    assert tokenize('谢谢谢谢', 'zho') == tokens

    # Separate codes for Mandarin and Cantonese
    assert tokenize('谢谢谢谢', 'cmn') == tokens
    assert tokenize('谢谢谢谢', 'yue') == tokens

def test_casefolding():
    assert tokenize('WEISS', 'de') == ['weiss']
    assert tokenize('weiß', 'de') == ['weiss']
    assert tokenize('İstanbul', 'tr') == ['istanbul']
    assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']
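
# For comparison: Python's built-in str.casefold() covers the German ß -> ss
# fold the test expects, but not the Turkish dotted/dotless I distinction,
# which needs language-aware handling like wordfreq's.
assert 'weiß'.casefold() == 'weiss'
# casefold() turns 'İ' into 'i' plus a combining dot above, not a plain 'i':
assert 'İstanbul'.casefold() == 'i\u0307stanbul'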

def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
    assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music']

    # Test Khmer, a script similar to Thai
    assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']

    # Test Hindi -- tokens split where there are spaces, and not where there aren't
    assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']

    # Remove vowel points in Hebrew
    assert tokenize('דֻּגְמָה', 'he') == ['דגמה']

    # Deal with commas, cedillas, and I's in Turkish
    assert tokenize('kișinin', 'tr') == ['kişinin']
    assert tokenize('KİȘİNİN', 'tr') == ['kişinin']

    # Deal with cedillas that should be commas-below in Romanian
    assert tokenize('acelaşi', 'ro') == ['același']
    assert tokenize('ACELAŞI', 'ro') == ['același']
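
# A minimal sketch of the Romanian fix, assuming a simple translation table
# from the (incorrect) cedilla letters to the comma-below letters the language
# actually uses; wordfreq's preprocessing is more general than this.
ROMANIAN_COMMA_FIX = str.maketrans('şţŞŢ', 'șțȘȚ')

assert 'acelaşi'.translate(ROMANIAN_COMMA_FIX) == 'același'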

def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    assert (
        tokenize("Па, има ту много ствари које не схваташ.", 'sr') ==
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )
    assert (
        tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') ==
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text('китабхана', 'az') == 'kitabxana'
    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text('бағырты', 'az') == 'bağırtı'
    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
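
# A toy sketch of the Cyrillic-to-Latin transliteration the Serbian test
# relies on, with just enough of the mapping for one phrase. The real tables
# cover the whole alphabet, including letters that map to digraphs such as
# 'љ' -> 'lj'; this mapping and helper are illustrative only.
SR_CYR_TO_LAT = {'п': 'p', 'а': 'a', 'и': 'i', 'м': 'm', 'т': 't', 'у': 'u'}

def translit_sr(text):
    # Lowercase first, then map each Cyrillic letter; pass everything else through.
    return ''.join(SR_CYR_TO_LAT.get(ch, ch) for ch in text.lower())

assert translit_sr('Па, има ту') == 'pa, ima tu'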