How to use the ftfy.fixes.fix_surrogates function in ftfy

To help you get started, we've selected a few ftfy examples based on popular ways it is used in public projects. ftfy.fixes.fix_surrogates replaces UTF-16 surrogate codepoints with the characters they represent when they are properly paired, and with U+FFFD (the replacement character) otherwise.
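
Surrogate codepoints typically appear when UTF-16 data is decoded two bytes at a time, or when JSON \u escapes are processed naively. A quick sketch of the basic call (the input string is an invented example):

from ftfy.fixes import fix_surrogates

# '\ud83d\udc8e' is a UTF-16 surrogate pair that should be one character.
print(fix_surrogates('\ud83d\udc8e'))   # '💎' (U+1F48E)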


github LuminosoInsight / python-ftfy / tests / test_characters.py
from ftfy.fixes import fix_surrogates

def test_surrogates():
    # Paired surrogates collapse into the single astral character they encode.
    assert fix_surrogates('\udbff\udfff') == '\U0010ffff'
    assert fix_surrogates('\ud800\udc00') == '\U00010000'
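
The tests cover correctly paired surrogates; an unpaired surrogate cannot represent any character, so fix_surrogates replaces it with U+FFFD. A small sketch:

from ftfy.fixes import fix_surrogates

# Paired surrogates are combined into the astral character they encode.
print(fix_surrogates('\ud83d\ude00'))   # '😀' (U+1F600)

# An unpaired surrogate has no valid interpretation, so it becomes U+FFFD.
print(fix_surrogates('\ud800 alone'))   # '\ufffd alone'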
github LuminosoInsight / exquisite-corpus / exquisite_corpus / preprocess.py
    """
    This function reads just the text (the part after the tab, if there is a tab). It
    removes URLs and Twitter handles from the text. It then language-detects the
    text, and if it is confident about the language, it outputs a new tab-separated
    file containing the language code and the processed text.

    This format could be read again by the same function, because the language code
    is now the metadata, but we have no reason to actually do this.
    """
    for line in infile:
        if "\t" in line:
            line = line.split("\t", 1)[1]
        text = line.rstrip()
        text = TWITTER_HANDLE_RE.sub("", text)
        text = TCO_RE.sub("", text)
        text = fix_surrogates(unescape_html(text)).replace("\n", " ")
        lang, confident = detect_language(text)
        if confident:
            print(f"{lang}\t{text}", file=outfile)
github LuminosoInsight / python-ftfy / ftfy / __init__.py
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
github LuminosoInsight / python-ftfy / ftfy / __init__.py
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
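
Both excerpts are versions of the loop inside ftfy's fix_text(), where each boolean parameter gates one fixing step; fix_surrogates runs by default. A sketch against the ftfy 5.x keyword API (later versions moved these options into a config object):

import ftfy

# The default pipeline includes the fix_surrogates step shown above.
print(ftfy.fix_text('badly decoded: \ud83d\ude00'))   # 'badly decoded: 😀'

# Individual steps can be switched off by keyword, e.g. keeping curly quotes:
print(ftfy.fix_text('“quoted”', uncurl_quotes=False))   # '“quoted”'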
github LuminosoInsight / exquisite-corpus / exquisite_corpus / tokens.py
def tokenize_file(
    infile, outfile, language, check_language=False, punctuation=False, ftfy=False
):
    """
    Take in a file of plain text, tokenize it as the given language, and write
    the result as lines of space-separated tokens.
    """
    for line in infile:
        if ftfy:
            # Run all ftfy fixes, but don't let it introduce line breaks
            line = fix_text(line.rstrip()).replace('\n', ' ')
        else:
            # Run only specific quick fixes from ftfy
            line = fix_surrogates(unescape_html(line.rstrip()))
        tokens = tokenize(
            line, language, include_punctuation=punctuation, external_wordlist=True
        )
        checked_lang = None
        if check_language:
            checked_lang, _confident = detect_language(line.rstrip())
        if (not check_language) or langcodes.tag_match_score(
            checked_lang, language
        ) >= 90:
            print(' '.join(tokens), file=outfile)
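
A hypothetical driver for tokenize_file, assuming exquisite-corpus and its dependencies (wordfreq, ftfy, langcodes) are installed; the import path follows the snippet header and the input line is invented:

import io
from exquisite_corpus.tokens import tokenize_file  # path per the header above

# In-memory stand-ins for real corpus files, with an invented noisy line.
infile = io.StringIO('caf&eacute; &amp; th&eacute;\n')
outfile = io.StringIO()

# ftfy=True runs the full fix_text() pipeline on each line before tokenizing.
tokenize_file(infile, outfile, language='en', ftfy=True)
print(outfile.getvalue())   # roughly 'café thé\n': entities fixed, '&' dropped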