How to use wordfreq - 9 common examples

To help you get started, we’ve selected a few wordfreq examples based on popular ways the library is used in public projects. All of the snippets below come from wordfreq’s own test suite.
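Because they are excerpted from test files, the snippets omit their imports. To run them yourself, something along these lines should work (a sketch: word_frequency and tokenize are top-level wordfreq functions, and preprocess_text lives in the wordfreq.preprocess module in recent versions; check your installed version):

import pytest
from wordfreq import word_frequency, tokenize
from wordfreq.preprocess import preprocess_text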


From LuminosoInsight/wordfreq, tests/test_general.py:
def test_minimums():
    assert word_frequency('esquivalience', 'en') == 0
    assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6
    assert word_frequency('the', 'en', minimum=1) == 1
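
Outside of tests, minimum is mainly a floor to apply before taking logarithms or reciprocals of frequencies. wordfreq also provides zipf_frequency, which does the log scaling for you; a minimal sketch (the noted values are approximate):

from wordfreq import word_frequency, zipf_frequency

# Unknown words return 0.0 by default, or the floor passed as `minimum`
word_frequency('esquivalience', 'en', minimum=1e-9)   # 1e-9

# zipf_frequency is log10(frequency * 1e9), a human-friendly 0-8 scale
zipf_frequency('the', 'en')   # about 7.7; very common words sit near the top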
From LuminosoInsight/wordfreq, tests/test_korean.py:
def test_combination():
    gamsa_freq = word_frequency('감사', 'ko')
    habnida_freq = word_frequency('합니다', 'ko')

    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2, rel=0.01)
    assert (
        1.0 / word_frequency('감사합니다', 'ko') ==
        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01)
    )
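
The assertion encodes wordfreq's rule for multi-token strings: the reciprocal of the combined frequency is the sum of the reciprocals of the token frequencies. A sketch of the same arithmetic:

from wordfreq import word_frequency

gamsa = word_frequency('감사', 'ko')
habnida = word_frequency('합니다', 'ko')

# Reciprocal-sum combination; for two equal tokens this halves the
# frequency, which is why '감사감사' above comes out near gamsa / 2
estimate = 1.0 / (1.0 / gamsa + 1.0 / habnida)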
From LuminosoInsight/wordfreq, tests/test_general.py:
def test_language_matching():
    freq = word_frequency('的', 'zh')
    assert word_frequency('的', 'zh-TW') == freq
    assert word_frequency('的', 'zh-CN') == freq
    assert word_frequency('的', 'zh-Hant') == freq
    assert word_frequency('的', 'zh-Hans') == freq
    assert word_frequency('的', 'yue-HK') == freq
    assert word_frequency('的', 'cmn') == freq
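
Region subtags, script subtags, and alternate codes such as 'cmn' are matched to the closest wordlist wordfreq actually ships rather than treated as unknown languages. To see which wordlists exist, you can call available_languages (the exact set depends on the version you have installed):

from wordfreq import available_languages

# Maps language codes to wordlist resources; a single 'zh' entry covers
# the regional and script variants exercised in the test above
print(sorted(available_languages()))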
From LuminosoInsight/wordfreq, tests/test_chinese.py:
def test_combination():
    xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
    assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)
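
The factor of 20 is the reciprocal-sum rule again (a factor of 2 for two equal tokens), multiplied by a further penalty of 10 for the word boundary the tokenizer had to infer, since Chinese is written without spaces. The size of that penalty is an implementation detail that this test pins down; as arithmetic:

from wordfreq import word_frequency

xiexie = word_frequency('谢谢', 'zh')

# Two equal tokens halve the frequency; one inferred boundary divides by 10
estimate = xiexie / (2 * 10)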
From LuminosoInsight/wordfreq, tests/test_general.py:
def test_arabic():
    # Remove tatweels
    assert tokenize('متــــــــعب', 'ar') == ['متعب']

    # Remove combining marks
    assert tokenize('حَرَكَات', 'ar') == ['حركات']

    # An Arabic ligature that is affected by NFKC normalization
    assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
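
The ligature case is ordinary NFKC behavior, which you can reproduce with the standard library; wordfreq applies this normalization, along with mark removal, while preprocessing:

import unicodedata

# U+FEFB (ARABIC LIGATURE LAM WITH ALEF ISOLATED FORM) expands to
# lam + alef under NFKC
assert unicodedata.normalize('NFKC', '\ufefb') == '\u0644\u0627'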
From LuminosoInsight/wordfreq, tests/test_chinese.py:
def test_alternate_codes():
    # Tokenization of Chinese works when you use other language codes
    # that are not equal to 'zh'.
    tokens = ['谢谢', '谢谢']

    # Code with a region attached
    assert tokenize('谢谢谢谢', 'zh-CN') == tokens

    # Over-long codes for Chinese
    assert tokenize('谢谢谢谢', 'chi') == tokens
    assert tokenize('谢谢谢谢', 'zho') == tokens

    # Separate codes for Mandarin and Cantonese
    assert tokenize('谢谢谢谢', 'cmn') == tokens
    assert tokenize('谢谢谢谢', 'yue') == tokens
From LuminosoInsight/wordfreq, tests/test_general.py:
def test_casefolding():
    assert tokenize('WEISS', 'de') == ['weiss']
    assert tokenize('weiß', 'de') == ['weiss']
    assert tokenize('İstanbul', 'tr') == ['istanbul']
    assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']
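
The German cases are standard Unicode case folding, which Python's str.casefold also performs; the Turkish cases need language-aware handling that casefold alone does not provide. A quick comparison (the casefold results are plain Python behavior; the Turkish mapping is what wordfreq adds):

# Full case folding, as in the German examples above
assert 'WEISS'.casefold() == 'weiss'
assert 'weiß'.casefold() == 'weiss'

# casefold() is language-blind: Turkish 'İ' becomes 'i' plus a combining
# dot above, not the plain 'i' that wordfreq's Turkish path produces
assert 'İ'.casefold() == 'i\u0307'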
From LuminosoInsight/wordfreq, tests/test_general.py:
def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
    assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music']

    # Test Khmer, a script similar to Thai
    assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']

    # Test Hindi -- tokens split where there are spaces, and not where there aren't
    assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']

    # Remove vowel points in Hebrew
    assert tokenize('דֻּגְמָה', 'he') == ['דגמה']

    # Deal with commas, cedillas, and I's in Turkish
    assert tokenize('kișinin', 'tr') == ['kişinin']
    assert tokenize('KİȘİNİN', 'tr') == ['kişinin']

    # Deal with cedillas that should be commas-below in Romanian
    assert tokenize('acelaşi', 'ro') == ['același']
    assert tokenize('ACELAŞI', 'ro') == ['același']
From LuminosoInsight/wordfreq, tests/test_transliteration.py:
def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    assert (
        tokenize("Па, има ту много ствари које не схваташ.", 'sr') ==
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )
    assert (
        tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') ==
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text('китабхана', 'az') == 'kitabxana'
    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text('бағырты', 'az') == 'bağırtı'
    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
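
Transliteration happens in wordfreq's text preprocessing, so you can also call it directly on Serbian text (preprocess_text's location is as noted above; the output shown is what the tokenize assertions imply):

from wordfreq.preprocess import preprocess_text

# Serbian Cyrillic is transliterated to Latin before frequency lookup
preprocess_text('схваташ', 'sr')   # 'shvataš'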