How to use the wordfreq.lossy_tokenize function in wordfreq

To help you get started, we've selected a few wordfreq examples based on popular ways lossy_tokenize is used in public projects.


From LuminosoInsight/wordfreq, tests/test_general.py (view on GitHub):
from wordfreq import tokenize, lossy_tokenize, word_frequency

def test_number_smashing():
    assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
    assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
    assert (
        lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True)
        == ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']
    )
    assert lossy_tokenize('1', 'en') == ['1']
    # Multi-digit numbers are "smashed": every digit becomes 0, preserving shape
    assert lossy_tokenize('3.14', 'en') == ['0.00']
    assert lossy_tokenize('24601', 'en') == ['00000']
    # Numbers with the same smashed shape share a single frequency entry
    assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
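
These assertions illustrate wordfreq's "number smashing": lossy_tokenize replaces each digit of a multi-digit token with 0, so any two numbers with the same shape map to the same entry in the frequency list. A minimal sketch of using this directly (the sample strings here are our own, not from the test suite):

from wordfreq import lossy_tokenize, word_frequency

for text in ['24601', '90210', '3.14', '2.72']:
    print(text, '->', lossy_tokenize(text, 'en'), word_frequency(text, 'en'))

# '24601' and '90210' both smash to ['00000'], so they report the same
# frequency; '3.14' and '2.72' both smash to ['0.00'].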
From LuminosoInsight/wordfreq, tests/test_at_sign.py (view on GitHub):
assert tokenize(text, "es") == [
        "la",
        "protección",
        "de",
        "los",
        "derechos",
        "de",
        "tod@s",
        "l@s",
        "trabajador@s",
        "migrantes"
    ]

    text = "el distrito 22@ de Barcelona"
    assert tokenize(text, 'es') == ["el", "distrito", "22@", "de", "barcelona"]
    assert lossy_tokenize(text, 'es') == ["el", "distrito", "00@", "de", "barcelona"]

    # It also appears in Portuguese
    text = "direitos e deveres para @s membr@s da comunidade virtual"
    assert tokenize(text, "pt") == [
        "direitos",
        "e",
        "deveres",
        "para",
        "@s",
        "membr@s",
        "da",
        "comunidade",
        "virtual"
    ]

    # Because this is part of our tokenization, the language code doesn't matter.
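
The comment above makes a point worth demonstrating: since @-handling is built into the tokenizer itself rather than into any per-language data, the language code should not change the result. A hedged sketch (the cross-language comparison is our reading of that comment, not part of the excerpt):

from wordfreq import tokenize

text = "direitos e deveres para @s membr@s da comunidade virtual"
# Tokenizing the same text under different language codes should agree
assert tokenize(text, "pt") == tokenize(text, "en")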