How to use the ftfy.fixes.fix_encoding function in ftfy

To help you get started, we’ve selected a few ftfy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github LuminosoInsight / python-ftfy / ftfy / streamtester / __init__.py View on Github external
def check_ftfy(self, text, encoding_only=True):
        """
        Run one text through ftfy and report whether ftfy would change it.

        HTML entities are unescaped first. Text that could be plain ASCII
        is not examined further. Otherwise the text is passed to
        `fix_encoding` when `encoding_only` is true, or to `fix_text` (with
        `uncurl_quotes` and `fix_character_width` turned off) when it is
        false. A changed text is printed next to its fixed form and counted
        in `self.num_fixed`; an unchanged text that still contains 'â€' or
        '\\x80' is printed as "Not fixed".

        Every call increments `self.count`; a progress dot is printed every
        100 calls and a running total every 10000 calls.
        """
        self.count += 1
        unescaped = unescape_html(text)

        if not possible_encoding(unescaped, 'ascii'):
            if encoding_only:
                repaired = fix_encoding(unescaped)
            else:
                repaired = fix_text(
                    unescaped,
                    uncurl_quotes=False,
                    fix_character_width=False,
                )

            if unescaped != repaired:
                # possibly filter common bots before printing
                report = '\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                    text=unescaped, fixed=repaired
                )
                print(report)
                self.num_fixed += 1
            elif 'â€' in unescaped or '\x80' in unescaped:
                # Left unchanged, but still contains a typical mojibake marker.
                print('\nNot fixed:\t{text!r}'.format(text=unescaped))

        # Periodic status output: a dot per 100 texts, a summary per 10000.
        at_hundred = self.count % 100 == 0
        at_ten_thousand = self.count % 10000 == 0
        if at_hundred:
            print('.', end='', flush=True)
        if at_ten_thousand:
            print('\n%d/%d fixed' % (self.num_fixed, self.count))
github LuminosoInsight / python-ftfy / tests / test_futuristic_codepoints.py View on Github external
def test_unknown_emoji():
    """
    Check how `fix_encoding` treats mojibake of codepoints in and out of the
    range accepted as emoji.
    """
    # The accepted emoji range has grown, so the futuristic emoji U+1F960
    # (probably a fortune cookie in Unicode 10.0) should be decodable after
    # a UTF-8 -> windows-1252 mix-up.
    expected = "\U0001f960 I see emoji in your future"
    garbled = expected.encode('utf-8').decode('windows-1252')
    assert fix_encoding(garbled) == expected

    # The codepoint should still be recognized when its byte A0 has been
    # mangled into a plain space.
    garbled_with_space = garbled.replace('\xa0', ' ')
    assert fix_encoding(garbled_with_space) == expected

    # Bumping the first byte gives a near-identical case whose codepoint will
    # definitely not exist anytime soon. Here the existing text, "ñŸ¥\xa0",
    # is considered more probable, so nothing should change.
    unassigned = "\U0005f960 I see mojibake in your present"
    left_alone = unassigned.encode('utf-8').decode('windows-1252')
    assert fix_encoding(left_alone) == left_alone
github LuminosoInsight / python-ftfy / tests / test_futuristic_codepoints.py View on Github external
def test_unknown_emoji():
    """
    Check how `fix_encoding` treats mojibake of codepoints in and out of the
    range accepted as emoji.
    """
    # The accepted emoji range has grown, so the futuristic emoji U+1F960
    # (probably a fortune cookie in Unicode 10.0) should be decodable after
    # a UTF-8 -> windows-1252 mix-up.
    expected = "\U0001f960 I see emoji in your future"
    garbled = expected.encode('utf-8').decode('windows-1252')
    assert fix_encoding(garbled) == expected

    # The codepoint should still be recognized when its byte A0 has been
    # mangled into a plain space.
    garbled_with_space = garbled.replace('\xa0', ' ')
    assert fix_encoding(garbled_with_space) == expected

    # Bumping the first byte gives a near-identical case whose codepoint will
    # definitely not exist anytime soon. Here the existing text, "ñŸ¥\xa0",
    # is considered more probable, so nothing should change.
    unassigned = "\U0005f960 I see mojibake in your present"
    left_alone = unassigned.encode('utf-8').decode('windows-1252')
    assert fix_encoding(left_alone) == left_alone
github LuminosoInsight / python-ftfy / tests / test_characters.py View on Github external
def test_byte_order_mark():
    """
    A UTF-8 byte order mark that was mis-decoded as three single-byte
    characters should be repaired into the single codepoint U+FEFF.
    """
    # The input is the UTF-8 BOM bytes EF BB BF read as one character each.
    # It is spelled with escapes so the literal cannot be silently lost or
    # altered when this file is copied, rendered, or re-encoded; an empty
    # input string would make this assertion meaningless.
    assert fix_encoding('\xef\xbb\xbf') == '\ufeff'
github LuminosoInsight / python-ftfy / ftfy / __init__.py View on Github external
uncurl_quotes=uncurl_quotes,
                fix_latin_ligatures=fix_latin_ligatures,
                fix_character_width=fix_character_width,
                fix_line_breaks=fix_line_breaks,
                fix_surrogates=fix_surrogates,
                remove_control_chars=remove_control_chars,
                remove_bom=remove_bom,
                normalization=normalization
            )
        )
        pos = textbreak

    return ''.join(out)

# Module-level aliases: `ftfy` is another name for `fix_text`, and
# `fix_encoding` exposes `fixes.fix_encoding` under this module's namespace.
ftfy = fix_text
fix_encoding = fixes.fix_encoding


def fix_file(input_file,
             encoding=None,
             fix_entities='auto',
             remove_terminal_escapes=True,
             fix_encoding=THOROUGH,
             fix_latin_ligatures=True,
             fix_character_width=True,
             uncurl_quotes=True,
             fix_line_breaks=True,
             fix_surrogates=True,
             remove_control_chars=True,
             remove_bom=True,
             normalization='NFC'
             ):
github LuminosoInsight / python-ftfy / ftfy / __init__.py View on Github external
within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
github LuminosoInsight / python-ftfy / ftfy / __init__.py View on Github external
fix_character_width=fix_character_width,
                fix_line_breaks=fix_line_breaks,
                fix_surrogates=fix_surrogates,
                remove_control_chars=remove_control_chars,
                remove_bom=remove_bom,
                normalization=normalization,
            )
        )
        pos = textbreak

    return ''.join(out)


# Some alternate names for the main functions:
# `ftfy` is another name for `fix_text`; the next two names expose
# functions from the `fixes` module under this module's namespace.
ftfy = fix_text
fix_encoding = fixes.fix_encoding
fix_text_encoding = fixes.fix_text_encoding  # deprecated


def fix_file(
    input_file,
    encoding=None,
    *,
    fix_entities='auto',
    remove_terminal_escapes=True,
    fix_encoding=True,
    fix_latin_ligatures=True,
    fix_character_width=True,
    uncurl_quotes=True,
    fix_line_breaks=True,
    fix_surrogates=True,
    remove_control_chars=True,