How to use the ftfy.ftfy function in ftfy

To help you get started, we’ve selected a few ftfy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github LuminosoInsight / wordfreq / wordfreq / build.py View on Github external
"""
    Load word frequencies from a "Web as Corpus" file, collected and
    provided by the University of Leeds.

    For more information, see: http://corpus.leeds.ac.uk/list.html
    """
    infile = codecs.open(filename, encoding='utf-8')

    counts = defaultdict(float)
    for line in infile:
        line = line.rstrip()
        if line:
            rank = line.split(' ')[0]
            if NUMBER_RE.match(rank) and line.count(' ') == 2:
                _, freq, token = line.split(' ')
                token = standardize_word(ftfy(token))
                freq = float(freq)
                counts[token] += freq

    return _scale_freqs(counts)
github commonsense / metanl / metanl / leeds_corpus_reader.py View on Github external
def leeds_corpus_frequencies(corpusfile, stemmer):
    """
    Tally word frequencies from a Leeds-style corpus frequency list.

    Every line is stripped and passed through ftfy. A line is used only
    when it consists of exactly three space-separated fields
    (rank, frequency, token) and the rank matches NUMBER_RE; every other
    line is ignored.

    The frequency is parsed as a float, multiplied by 100 and truncated
    to an int. That amount is added to `freqs` for each space-separated
    word of `stemmer(token)` and to `tokenfreqs` for the lowercased
    token; words or tokens containing a comma are left out. Finally,
    each `freqs` entry is raised to the unstemmed token's own count
    whenever that count is larger.

    corpusfile -- path of the UTF-8 encoded frequency list.
    stemmer -- callable mapping a token to its stemmed form (a string,
        possibly several space-separated words); None means tokens are
        used unchanged.

    NOTE(review): no return statement is visible for this function in
    this excerpt, so none is added here -- confirm how callers obtain
    the resulting tables.
    """
    if stemmer is None:
        stemmer = lambda x: x

    freqs = defaultdict(int)
    tokenfreqs = defaultdict(int)

    # The context manager closes the file even if parsing raises; the
    # original left the handle open until garbage collection.
    with codecs.open(corpusfile, encoding='utf-8') as infile:
        for line in infile:
            line = ftfy(line.strip())
            if not line:
                continue

            fields = line.split(' ')
            # Exactly two spaces means exactly three fields.
            if len(fields) != 3 or not NUMBER_RE.match(fields[0]):
                continue

            _rank, freq, token = fields
            stemmed = stemmer(token)
            # Single-argument parenthesised form prints the same text on
            # Python 2 and Python 3.
            print("%s -> %s" % (token, stemmed))

            # Counts are stored as integers in units of 1/100.
            freq_int = int(float(freq) * 100)
            for word in stemmed.split(' '):
                if ',' not in word:
                    freqs[word] += freq_int
            if ',' not in token:
                tokenfreqs[token.lower()] += freq_int

    # A stemmed entry must never count for less than the raw token with
    # the same spelling.
    for key in tokenfreqs:
        if tokenfreqs[key] > freqs[key]:
            freqs[key] = tokenfreqs[key]
github baieric / chatstats / chatstats.py View on Github external
    data['content'] = data['content'].apply(lambda x: ftfy.ftfy(x) if type(x) == str else x)