How to use the ftfy.fixes.unescape_html function in ftfy

To help you get started, we’ve selected a few examples of ftfy.fixes.unescape_html, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: LuminosoInsight/python-ftfy, tests/test_entities.py (view on GitHub)
assert fix_text(example, fix_entities=True) == '&\n\n&'
    assert fix_text_segment(example, fix_entities=True) == '&\n\n&'

    assert fix_text(example, fix_entities=False) == '&\n\n&'
    assert fix_text_segment(example, fix_entities=False) == '&\n\n&'

    assert fix_text_segment('<>', fix_entities=False) == '<>'
    assert fix_text_segment('<>', fix_entities=True) == '<>'
    assert fix_text_segment('<>') == '<>'
    assert fix_text_segment('jednocześnie') == 'jednocześnie'
    assert fix_text_segment('JEDNOCZEŚNIE') == 'JEDNOCZEŚNIE'
    assert fix_text_segment('ellipsis…', normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('ellipsis…', normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('broken') == 'broken\x81'
    assert unescape_html('euro €') == 'euro €'
    assert unescape_html('not an entity x6;') == 'not an entity x6;'
Source: LuminosoInsight/python-ftfy, ftfy/streamtester/__init__.py (view on GitHub)
def check_ftfy(self, text, encoding_only=True):
        """
        Given a single text input, check whether `ftfy.fix_text_encoding`
        would change it. If so, display the change.
        """
        self.count += 1
        text = unescape_html(text)
        if not possible_encoding(text, 'ascii'):
            if encoding_only:
                fixed = fix_encoding(text)
            else:
                fixed = fix_text(text, uncurl_quotes=False, fix_character_width=False)
            if text != fixed:
                # possibly filter common bots before printing
                print('\nText:\t{text!r}\nFixed:\t{fixed!r}\n'.format(
                    text=text, fixed=fixed
                ))
                self.num_fixed += 1
            elif 'â€' in text or '\x80' in text:
                print('\nNot fixed:\t{text!r}'.format(text=text))

        # Print status updates once in a while
        if self.count % 100 == 0:
Source: LuminosoInsight/python-ftfy, ftfy/__init__.py (view on GitHub)
See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
Source: chartbeat-labs/textacy, textacy/corpora/wiki_reader.py (view on GitHub)
# replace internal links with just their labels
    text = replace_internal_links(text)
    # text = replace_internal_links(text)  # TODO: is this needed?

    # remove table markup
    text = text.replace('||', '\n|').replace('!!', '\n!')  # put each cell on a separate line
    text = re_table_formatting.sub('\n', text)  # remove formatting lines
    text = re_table_cell_formatting.sub('\n\\3', text)  # leave only cell content

    # strip out text formatting
    text = re_italic_quote.sub(r'"\1"', text)
    text = re_bold_italic.sub(r'\1', text)
    text = re_quote_quote.sub(r'"\1"', text)

    # unescape html entities
    text = ftfy.fixes.unescape_html(text)

    # final cleanup
    text = re_headings.sub(r'\n\n\2\n\n', text)
    text = re_dots.sub('...', text)
    text = re_brackets.sub(r'', text)
    text = text.replace('[[', '').replace(']]', '')
    text = text.replace('<<', '«').replace('>>', '»')
    text = re_random_cruft.sub(r'\1', text)
    text = re.sub(r'\n\W+?\n', r'\n', text, flags=re.UNICODE)
    text = text.replace(',,', ',').replace(',.', '.')
    text = re_spaces.sub(' ', text)
    text = re_linebreaks.sub(r'\n\n', text)

    return text.strip()
Source: LuminosoInsight/python-ftfy, ftfy/__init__.py (view on GitHub)
"""
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount
    of text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
Source: chartbeat-labs/textacy, textacy/datasets/wikipedia.py (view on GitHub)
text = _replace_external_links(text)
    # drop magic words behavioral switches
    text = re_magic_words.sub("", text)
    # replace internal links with just their labels
    text = _replace_internal_links(text)
    # text = _replace_internal_links(text)  # TODO: is this needed?
    # remove table markup
    text = text.replace("||", "\n|").replace("!!", "\n!")  # put each cell on a new line
    text = re_table_formatting.sub("\n", text)  # remove formatting lines
    text = re_table_cell_formatting.sub("\n\\3", text)  # leave only cell content
    # strip out text formatting
    text = re_italic_quote.sub(r'"\1"', text)
    text = re_bold_italic.sub(r"\1", text)
    text = re_quote_quote.sub(r'"\1"', text)
    # unescape html entities
    text = ftfy.fixes.unescape_html(text)
    # final cleanup
    if include_headings is True:
        text = re_headings.sub(r"\n\n\2\n\n", text)
    else:
        text = re_headings.sub(r"\n\n", text)
    text = re_dots.sub("...", text)
    text = re_brackets.sub(r"", text)
    text = text.replace("[[", "").replace("]]", "")
    text = text.replace("<<", "«").replace(">>", "»")
    text = re_random_cruft.sub(r"\1", text)
    text = re.sub(r"\n\W+?\n", r"\n", text, flags=re.UNICODE)
    text = text.replace(",,", ",").replace(",.", ".")
    text = re_spaces.sub(" ", text)
    text = re_linebreaks.sub(r"\n\n", text)
    return text.strip()