How to use the ftfy.fixes.uncurl_quotes function in ftfy

To help you get started, we’ve selected a few ftfy examples based on popular ways it is used in public projects.

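Before looking at real-world usage, here is a minimal sketch of what the function does: uncurl_quotes replaces curly (typographic) quotation marks with their plain ASCII equivalents and leaves everything else alone.

from ftfy.fixes import uncurl_quotes

print(uncurl_quotes('“It’s curly,” she said.'))
# "It's curly," she said.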

github LuminosoInsight / exquisite-corpus / exquisite_corpus / freq.py
"""
    Take in multiple files of word counts by their filename, and produce a
    frequency list in the named output file. The counts should be in the format
    we produce that has a __total__ at the top. We merge them into a single
    frequency list using the 'figure skating average' defined above.
    """
    freq_dicts = []
    for input_filename in input_filenames:
        freq_dict = defaultdict(float)
        with open(input_filename, encoding='utf-8') as infile:
            total = None
            for line in infile:
                word, strcount = line.rstrip().split('\t', 1)
                # Correct for earlier steps that might not have handled curly
                # apostrophes consistently
                word = uncurl_quotes(word).strip("' ")
                if word:
                    count = int(strcount)
                    if word == '__total__':
                        total = count
                    else:
                        freq = count / total
                        if freq < 1e-9:
                            break
                        freq_dict[word] += freq
        freq_dicts.append(freq_dict)

    # merge_freqs and _write_frequency_file are helpers defined elsewhere
    # in freq.py.
    merged_dict = merge_freqs(freq_dicts)
    with open(output_filename, 'w', encoding='utf-8') as outfile:
        _write_frequency_file(merged_dict, outfile)
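The uncurl_quotes(word).strip("' ") step is what lets counts for tokens that differ only in apostrophe style land in the same bucket. A small demonstration with made-up tokens:

from ftfy.fixes import uncurl_quotes

for token in ['don’t', "don't", '‘quoted’']:
    print(uncurl_quotes(token).strip("' "))
# don't
# don't
# quoted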
github LuminosoInsight / python-ftfy / ftfy / __init__.py
    # Body of the per-segment fixing loop in ftfy/__init__.py (an earlier
    # release: note fixes.fix_text_encoding, since renamed fix_encoding).
    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
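In everyday use you reach this loop through ftfy.fix_text, where uncurl_quotes is a keyword flag that defaults to True. A quick sketch:

import ftfy

print(ftfy.fix_text('“quoted”'))
# "quoted"
print(ftfy.fix_text('“quoted”', uncurl_quotes=False))
# “quoted”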
github IndicoDataSolutions / finetune / finetune / base_models / gpt / encoder.py
# re and uncurl_quotes are the only external names this helper needs.
import re

from ftfy.fixes import uncurl_quotes


def _text_standardize(text):
    """
    Fixes some issues the spacy tokenizer had on books corpus
    Also handles whitespace standardization
    """
    # Put spaces around runs of punctuation so they tokenize as separate
    # symbols (raw strings keep the backslashes intact for the regex engine).
    text = re.sub(
        r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""",
        r" \1 ",
        text,
    )
    # Normalize whitespace around newlines, then collapse all other runs
    # of whitespace to single spaces.
    text = re.sub(r"\s*\n\s*", " \n ", text)
    text = re.sub(r"[^\S\n]+", " ", text)
    return uncurl_quotes(text.strip().lower())
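A worked input/output pair, traced by hand through the three substitutions and the final uncurl_quotes call:

print(_text_standardize('He said, “It’s fine.”'))
# he said , "it's fine."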
github LuminosoInsight / python-ftfy / ftfy / __init__.py
    # The same loop in a later ftfy release: fix_encoding (renamed from
    # fix_text_encoding) now runs before HTML unescaping, and remove_bom
    # is skipped when remove_control_chars has already run.
    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
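To see the whole loop earn its keep, feed it text that needs several of these steps at once. The string below is the doubly encoded mojibake example from ftfy’s own documentation: fixing it takes two passes of fix_encoding before the text stops changing, after which uncurl_quotes flattens the recovered apostrophe.

import ftfy

print(ftfy.fix_text('The Mona Lisa doesnÃ¢â‚¬â„¢t have eyebrows.'))
# The Mona Lisa doesn't have eyebrows.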