How to use the ftfy.badness.sequence_weirdness function in ftfy

To help you get started, we’ve selected a few ftfy examples that show popular ways `sequence_weirdness` is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github LuminosoInsight / python-ftfy / tests / test_futuristic_codepoints.py View on Github external
def test_unicode_9():
    """A newer capital letter scores 0; an unassigned codepoint scores 2."""
    # 'bɪg'.upper() on Python 3.6 or later produces this string. Its middle
    # letter is the new codepoint U+A7AE LATIN CAPITAL LETTER SMALL CAPITAL I.
    uppercased = "BꞮG"
    assert sequence_weirdness(uppercased) == 0

    # Swapping in a definitely-unassigned character must come out weirder
    # than the string above.
    with_unassigned = "B\U00090000G"
    assert sequence_weirdness(with_unassigned) == 2
github LuminosoInsight / python-ftfy / tests / test_characters.py View on Github external
def test_emoji_variation_selector():
    """Hearts explicitly marked as emoji are not counted as weird."""
    # Every heart is followed by U+FE0F, the variation selector that marks
    # it as an emoji; repeating the pair ten times must still score 0.
    emoji_hearts = '❤\ufe0f' * 10
    weirdness = sequence_weirdness(emoji_hearts)
    assert weirdness == 0
github LuminosoInsight / python-ftfy / tests / test_futuristic_codepoints.py View on Github external
def test_unicode_9():
    """Compare the weirdness of a new capital letter and an unassigned codepoint."""
    expectations = (
        # 'bɪg'.upper() in Python 3.6 or later; contains the new codepoint
        # U+A7AE LATIN CAPITAL LETTER SMALL CAPITAL I. Not weird at all.
        ("BꞮG", 0),
        # Same shape, but with a definitely-unassigned character in the
        # middle, which has to score as weirder than the string above.
        ("B\U00090000G", 2),
    )
    for text, expected_weirdness in expectations:
        assert sequence_weirdness(text) == expected_weirdness
github LuminosoInsight / python-ftfy / tests / test_characters.py View on Github external
def test_bmp_characters():
    """Round-trip code points U+00A0..U+FFFC through UTF-8/Latin-1 mojibake.

    Each eligible character is garbled once and twice (UTF-8 bytes misread
    as Latin-1), and both garblings must be restored to the original
    character, both directly and by replaying the returned plan.
    """
    # Unicode general categories left out of the test: private use (Co),
    # unassigned (Cn), surrogates (Cs), spacing and nonspacing marks
    # (Mc, Mn), and modifier symbols (Sk).
    skipped_categories = ('Co', 'Cn', 'Cs', 'Mc', 'Mn', 'Sk')

    for codepoint in range(0xa0, 0xfffd):
        char = chr(codepoint)
        if unicodedata.category(char) in skipped_categories:
            continue

        garbled_once = char.encode('utf-8').decode('latin-1')
        # Skip characters whose re-encoding is protected by the
        # 'sequence_weirdness' metric (a negative score for the mojibake).
        if sequence_weirdness(garbled_once) < 0:
            continue

        # Garbling the already-garbled text again gives the double mojibake.
        garbled_twice = garbled_once.encode('utf-8').decode('latin-1')
        for mojibake in (garbled_once, garbled_twice):
            fixed, plan = fix_encoding_and_explain(mojibake)
            assert fixed == char
            assert apply_plan(mojibake, plan) == char
github LuminosoInsight / python-ftfy / tests / test_futuristic_codepoints.py View on Github external
def test_unicode_11():
    """Georgian mtavruli text is not weird, and its mojibake is fixable."""
    # The mtavruli form of the Georgian script arrived with Unicode 11. It
    # plays a role similar to capital letters: emphasis and headlines.
    #
    # From Python 3.7.0 onward, .upper() on Georgian text produces this
    # form, so the result should be treated as reasonable text no matter
    # which Python version is running.
    #
    # The sample is the mtavruli form of "ქართული ენა" ("Georgian language").
    mtavruli = 'ᲥᲐᲠᲗᲣᲚᲘ ᲔᲜᲐ'
    assert sequence_weirdness(mtavruli) == 0

    # Misread the UTF-8 bytes as Windows-1252 to build the mojibake, then
    # check that it is repaired back to the original text.
    utf8_bytes = mtavruli.encode('utf-8')
    garbled = utf8_bytes.decode('sloppy-windows-1252')
    assert fix_encoding(garbled) == mtavruli
github LuminosoInsight / python-ftfy / tests / test_futuristic_codepoints.py View on Github external
def test_unicode_10():
    """Zanabazar Square text (added in Unicode 10) is not weird."""
    # The word "thalīṃ" written in the Zanabazar Square Script. Python 3.7
    # recognizes these characters as assigned, so for consistency ftfy
    # should recognize them on every version.
    zanabazar_word = "\U00011A1A\U00011A2C\U00011A01\U00011A38"
    weirdness = sequence_weirdness(zanabazar_word)
    assert weirdness == 0
github LuminosoInsight / python-ftfy / tests / test_characters.py View on Github external
def test_emoji_skintone_selector():
    """Emoji followed by skin-tone selectors are not counted as weird."""
    # Four Santa Claus emoji, each with a different skin-tone selector.
    # The heuristic must not treat the selectors as weird characters.
    santas = '🎅🏿🎅🏽🎅🏼🎅🏻'
    weirdness = sequence_weirdness(santas)
    assert weirdness == 0