How to use the ftfy.fix_text function in ftfy

To help you get started, we've selected a few ftfy.fix_text examples based on popular ways the function is used in public projects.
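
At its simplest, ftfy.fix_text takes a string whose encoding was mangled and returns the repaired string. A minimal sketch, using the mojibake example from ftfy's own README:

import ftfy

# 'âœ”' is '✔' (U+2714) that was encoded as UTF-8 and mis-decoded as Latin-1/cp1252
print(ftfy.fix_text('âœ” No problems'))  # -> '✔ No problems'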

github mholtzscher / spacy_readability / tests / test_books.py
def test_dale_chall(text, expected, nlp):
    text = ftfy.fix_text(text)
    text = " ".join(text.split())
    doc = nlp(text)
    assert pytest.approx(expected, rel=1e-2) == doc._.dale_chall
github mholtzscher / spacy_readability / tests / test_books.py
def test_linsear_write(text, expected, nlp):
    text = ftfy.fix_text(text)
    text = " ".join(text.split())
    doc = nlp(text)
    assert pytest.approx(expected, rel=1e-2) == doc._.linsear_write
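
Both tests share the same two-step prelude: repair the text with fix_text, then collapse every whitespace run to a single space. Pulled out on its own (the sample string is invented for illustration):

import ftfy

def normalize(text):
    # Repair mojibake first, then squeeze whitespace runs down to single spaces
    text = ftfy.fix_text(text)
    return " ".join(text.split())

print(normalize("CafÃ©   society"))  # -> 'Café society'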
github LuminosoInsight / python-ftfy / tests / test_entities.py
def test_entities():
    example = '&amp;\n<html>\n&amp;'
    assert fix_text(example) == '&\n<html>\n&amp;'
    assert fix_text_segment(example) == '&amp;\n<html>\n&amp;'

    assert fix_text(example, fix_entities=True) == '&\n<html>\n&'
    assert fix_text_segment(example, fix_entities=True) == '&\n<html>\n&'

    assert fix_text(example, fix_entities=False) == '&amp;\n<html>\n&amp;'
    assert fix_text_segment(example, fix_entities=False) == '&amp;\n<html>\n&amp;'

    assert fix_text_segment('&lt;&gt;', fix_entities=False) == '&lt;&gt;'
    assert fix_text_segment('&lt;&gt;', fix_entities=True) == '<>'
    assert fix_text_segment('&lt;&gt;') == '<>'
    assert fix_text_segment('jednocze&sacute;nie') == 'jednocześnie'
    assert fix_text_segment('JEDNOCZE&SACUTE;NIE') == 'JEDNOCZEŚNIE'
    assert fix_text_segment('ellipsis&hellip;', normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('ellipsis&#133;', normalization='NFKC') == 'ellipsis...'
    assert fix_text_segment('broken&#x81;') == 'broken\x81'
    assert unescape_html('euro &#x80;') == 'euro €'
    assert unescape_html('not an entity &#20x6;') == 'not an entity &#20x6;'
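
What the test pins down is the fix_entities switch in the ftfy 5.x API: 'auto' (the default) decodes HTML entities only when the text does not otherwise look like HTML, while True and False force the choice. A quick sketch of the same contrast:

from ftfy import fix_text

print(fix_text('jednocze&sacute;nie'))              # 'jednocześnie' -- no markup, entity decoded
print(fix_text('<b>&amp;</b>'))                     # '<b>&amp;</b>' -- looks like HTML, left alone
print(fix_text('<b>&amp;</b>', fix_entities=True))  # '<b>&</b>' -- decoding forced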
github mdda / deep-learning-workshop / notebooks / work-in-progress / 2018-10_ZeroShotRelationships / text_utils.py
def encode_and_clean(self, texts):
    texts_bpes, texts_clean, lens_bpes = [], [], []
    for text in texts:
        # Repair encoding damage with ftfy, standardize, then run the spaCy tokenizer
        doc = self.nlp(text_standardize(ftfy.fix_text(text)))
        text_tokens, text_bpe, len_bpe = [], [], []
        for token in doc:
            token_text = token.text
            text_tokens.append(token_text)
            # Look up each BPE piece's vocabulary id, defaulting to 0 for unknowns
            new_bpe = [self.encoder.get(t, 0) for t in self.bpe(token_text.lower()).split(' ')]
            text_bpe.extend(new_bpe)
            len_bpe.append(len(new_bpe))
        texts_clean.append(' '.join(text_tokens))  # Reassemble
        texts_bpes.append(text_bpe)
        lens_bpes.append(len_bpe)
    return texts_bpes, texts_clean, lens_bpes
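
The transferable idea is the ordering: run fix_text first so the tokenizer never sees mojibake, then standardize, then tokenize. A reduced sketch with a blank spaCy pipeline standing in for self.nlp (the pipeline and sample string are assumptions, not the repo's):

import ftfy
import spacy

nlp = spacy.blank('en')  # stand-in for the repo's tokenizer
doc = nlp(ftfy.fix_text('The reportâ€™s findingsâ€¦'))
print([token.text for token in doc])  # tokens of "The report's findings…"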
github bitextor / bitextor / bitextor-get-html.py
    fields = line.split('\t')
    fields = list(map(str.strip, fields))  # Strip all elements
    del fields[-1]
    #sys.stderr.write("fields:" + str(len(fields)) + " " + str(fields) + "\n")

    cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False)

    # read file
    with open("{inDir}/{name}".format(inDir=args.inDir, name=lineNum), "r") as f:
        b64t = f.read()
    #sys.stderr.write("b64t:" + b64t + "\n")

    try:
        # Drop any XML encoding declaration, clean the HTML, repair mojibake, then parse
        cleanhtml = cleaner.clean_html(re.sub(r'encoding *= *"[^"]+"', '', b64t, flags=re.IGNORECASE))
        document = html5lib.parse(ftfy.fix_text(cleanhtml), treebuilder="lxml", namespaceHTMLElements=False)
        tree = etree.tostring(document)
        cleantree = tree.decode("utf8")
        cleantree = cleantree.replace("\t", " ")

        with open("{outDir}/{name}".format(outDir=args.outDir, name=lineNum), "w") as f:
            f.write(cleantree)
    except etree.ParserError as err:
        sys.stderr.write("HTML parsing error for document with URL '{1}': {0}\n".format(err, fields[0]))

    lineNum += 1
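
Stripped of the per-line I/O, the pipeline is: clean the HTML, run fix_text over it, then parse. A condensed sketch (the sample document is made up; Cleaner comes from lxml, as in bitextor):

import ftfy
import html5lib
from lxml import etree
from lxml.html.clean import Cleaner

cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False)
raw = '<html><body><p>cafÃ©</p></body></html>'  # mojibake for 'café'
document = html5lib.parse(ftfy.fix_text(cleaner.clean_html(raw)),
                          treebuilder="lxml", namespaceHTMLElements=False)
print(etree.tostring(document).decode("utf8"))  # ...<p>café</p>...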
github IndicoDataSolutions / finetune / finetune / encoding.py
def _text_standardize(text):
    """
    Fixes some issues the spaCy tokenizer had on the books corpus.
    Also handles whitespace standardization.
    """
    # Normalize the various Unicode dashes to a plain hyphen
    text = text.replace('—', '-')
    text = text.replace('–', '-')
    text = text.replace('―', '-')

    text = text.replace('…', '...')
    text = text.replace('´', "'")
    # Pad punctuation runs with spaces so they become separate tokens
    # (raw strings keep the backslashes intact, so \\+ really matches backslash runs)
    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
    # Normalize whitespace around newlines, then collapse other whitespace runs
    text = re.sub(r'\s*\n\s*', ' \n ', text)
    text = re.sub(r'[^\S\n]+', ' ', text)
    return ftfy.fix_text(text.strip().lower())
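
Run on a sample string, the standardizer's effect is easy to see (a quick demonstration, assuming _text_standardize from above is in scope):

print(_text_standardize('Hello—world?!'))
# -> 'hello - world ? !'  (dash normalized, punctuation padded, everything lowercased)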
github shobrook / BitVision / research / src / modules / scraper.py
def collect_articles(urls, end_date, filename):
    """Loops over all the URLs collected in the parent function."""

    for url in urls:
        tree = parse_html(url)
        config = page_config(tree)

        try:
            if end_date and dateParse(config["date"]) < dateParse(end_date):
                break
            else:
                # Append the date, the mojibake-repaired title, and the URL to the CSV
                with open(os.path.dirname(os.getcwd()) + "/../data/" + filename, "a") as csv_file:
                    csv.writer(csv_file).writerow([config["date"], ftfy.fix_text(config["title"]), url])
        except Exception:
            print("\nEXCEPTION OCCURRED\n")
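
The CSV-append pattern here recurs in the second BitVision scraper below. In isolation, with an invented row and output path, it amounts to:

import csv
import ftfy

row = {"date": "2018-01-02", "title": "Bitcoin pricesâ€¦", "url": "https://example.com/article"}
with open("articles.csv", "a", newline="") as f:  # hypothetical output path
    csv.writer(f).writerow([row["date"], ftfy.fix_text(row["title"]), row["url"]])
    # the title is written as 'Bitcoin prices…'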
github furas / python-examples / decode-encode / macosx-linux / main.py
    text3 = unidecode(text2)
    text4 = unicodedata.normalize('NFC', text2)

    text5 = unidecode(text4)

    print('                                text:', text, '| len:', len(text))
    print('                            expected:', expected, '  | len:', len(expected))
    print('                    text == expected:', text == expected)
    print('-------------------------------------')
    print('text.encode("cp437").decode("utf-8"):', text2, '  | len:', len(text2), '| expected:', text2 == expected)
    print('                    unidecode(text2):', text3, '  | len:', len(text3), '| expected:', text3 == expected)
    print('-------------------------------------')
    print(' unicodedata.normalize("NFC", text2):', text4, '  | len:', len(text4), '| expected:', text4 == expected)
    print('                    unidecode(text4):', text5, '  | len:', len(text5), '| expected:', text5 == expected)
    print('-------------------------------------')
    print('                 ftfy.fix_text(text):', ftfy.fix_text(text))
    print('-------------------------------------')
github jeremyjbowers / pool / pool / utils.py
def clean_unicode(possible_string):
    # Python 2: only bytestrings (str) need decoding; calling .decode() on a
    # unicode value would round-trip through ASCII and fail on non-ASCII text
    if isinstance(possible_string, basestring):
        string = possible_string.strip()
        if isinstance(string, str):
            string = string.decode('utf-8')
        return ftfy.fix_text(string)
    return possible_string
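
On Python 3 the same guard collapses considerably, since str is already Unicode; a sketch of an equivalent (not the repo's code):

import ftfy

def clean_unicode_py3(value):
    # bytes need decoding first; str can go straight to ftfy
    if isinstance(value, bytes):
        value = value.decode('utf-8')
    if isinstance(value, str):
        return ftfy.fix_text(value.strip())
    return value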
github shobrook / BitVision / src / modules / news_scraper.py
def collect_articles(urls, source, args, filename):
	"""Loops over all the URLs collected in the parent function."""
	for url in urls:
		tree = parse_html(url)
		config = page_config(source, tree)

		print(url)

		article_year = dateParse(config["date"]).year
		if args.scrape_year and article_year < int(args.scrape_year):
			break
		elif args.scrape_year and article_year != int(args.scrape_year):
			continue  # Outside the requested year; skip this article
		else:
			with open(filename, "a") as csv_file:
				csv.writer(csv_file).writerow([config["date"], ftfy.fix_text(config["title"]), url])