How to use the wordfreq.util.standardize_word function in wordfreq

To help you get started, we’ve selected a few wordfreq examples based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github LuminosoInsight / wordfreq / wordfreq / build.py View on Github external
def _read_csv_basic(filename):
    """
    Read a UTF-8 file of `word,count` lines into a dict.

    Each line is split on its last comma, so the word itself may contain
    commas. Lines with no comma are skipped. The word is passed through
    `standardize_word` before being used as a key; if two lines yield the
    same key, the later count replaces the earlier one.

    Returns a dict mapping each standardized word to its count as a float.
    Raises ValueError if the text after the last comma is not a valid float.
    """
    counts = {}
    # The context manager closes the file even if a malformed count raises
    # partway through; the original left the handle open.
    with codecs.open(filename, encoding='utf-8') as infile:
        for line in infile:
            if ',' not in line:
                continue
            word, count = line.rstrip('\n').rsplit(',', 1)
            counts[standardize_word(word)] = float(count)
    return counts
github LuminosoInsight / wordfreq / wordfreq / build.py View on Github external
"""
    Load word frequencies from a "Web as Corpus" file, collected and
    provided by the University of Leeds.

    For more information, see: http://corpus.leeds.ac.uk/list.html
    """
    infile = codecs.open(filename, encoding='utf-8')

    counts = defaultdict(float)
    for line in infile:
        line = line.rstrip()
        if line:
            rank = line.split(' ')[0]
            if NUMBER_RE.match(rank) and line.count(' ') == 2:
                _, freq, token = line.split(' ')
                token = standardize_word(ftfy(token))
                freq = float(freq)
                counts[token] += freq

    return _scale_freqs(counts)
github LuminosoInsight / wordfreq / wordfreq / query.py View on Github external
def word_frequency(word, lang, wordlist='multi', offset=0.):
    """
    Look up the frequency of `word` for language code `lang` in `wordlist`.

    The word is passed through `standardize_word` before the lookup.
    `offset` is added to the stored frequency; when no matching row is
    found, `offset` alone is returned.
    """
    query = "SELECT freq from words where word=? and lang=? and wordlist=?"
    cursor = CONN.cursor()
    cursor.execute(query, (standardize_word(word), lang, wordlist))
    match = cursor.fetchone()
    if match is not None:
        return match[0] + offset
    return offset