How to use the wordfreq.word_frequency function in wordfreq

To help you get started, we've selected a few wordfreq examples based on popular ways word_frequency is used in public projects.
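
Before the project excerpts, here is a minimal sketch (not taken from any of those projects) of what word_frequency returns. The companion function zipf_frequency reports the same estimate on a logarithmic scale.

from wordfreq import word_frequency, zipf_frequency

# word_frequency returns the word's estimated frequency as a proportion of
# all words in the language, so common words are around 1e-2 and rare words
# 1e-6 or smaller; words missing from the wordlist return 0.0 by default.
print(word_frequency('the', 'en'))        # roughly 0.05
print(word_frequency('frequency', 'en'))  # a much smaller number

# zipf_frequency reports the same estimate on a log scale of roughly 0 to 8.
print(zipf_frequency('the', 'en'))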

github LuminosoInsight / wordfreq / tests / test_general.py
def test_minimums():
    assert word_frequency('esquivalience', 'en') == 0
    assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6
    assert word_frequency('the', 'en', minimum=1) == 1
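
This test exercises the minimum keyword argument: a word that is missing from the wordlist returns 0 by default, and minimum sets a floor under the result. One common reason to set a floor is log-scale scoring, sketched below with a hypothetical helper (log_frequency is not part of wordfreq):

import math
from wordfreq import word_frequency

def log_frequency(word, lang='en', floor=1e-9):
    # Hypothetical helper: clamp unknown words to a small floor so the
    # logarithm is always defined instead of failing on zero.
    return math.log(word_frequency(word, lang, minimum=floor))

print(log_frequency('the'))            # close to log(0.05)
print(log_frequency('esquivalience'))  # log(1e-9), not a math domain error
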
github LuminosoInsight / wordfreq / tests / test_korean.py
def test_combination():
    gamsa_freq = word_frequency('감사', 'ko')
    habnida_freq = word_frequency('합니다', 'ko')

    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2, rel=0.01)
    assert (
        1.0 / word_frequency('감사합니다', 'ko') ==
        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01)
    )
github LuminosoInsight / wordfreq / tests / test_general.py
def test_language_matching():
    freq = word_frequency('的', 'zh')
    assert word_frequency('的', 'zh-TW') == freq
    assert word_frequency('的', 'zh-CN') == freq
    assert word_frequency('的', 'zh-Hant') == freq
    assert word_frequency('的', 'zh-Hans') == freq
    assert word_frequency('的', 'yue-HK') == freq
    assert word_frequency('的', 'cmn') == freq
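
This test depends on wordfreq's language matching: regional and script variants of a language tag, and alternate codes such as 'cmn' or 'yue', resolve to the same underlying wordlist. A small sketch of the same behaviour, using only tags that appear in the test:

from wordfreq import word_frequency

# All of these tags are matched to the same Chinese data, so the numbers
# printed should be identical, as the test above asserts.
for code in ['zh', 'zh-TW', 'zh-Hans', 'cmn']:
    print(code, word_frequency('的', code))
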
github LuminosoInsight / wordfreq / tests / test_chinese.py
def test_combination():
    xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
    assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)
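
Taken together, the combination tests show that word_frequency accepts strings that tokenize into more than one word: the per-token frequencies are combined so that their reciprocals add (the Korean test), and an additional penalty appears to apply when the tokenizer has to infer the word boundaries itself, which is why the doubled Chinese word comes out at about one twentieth rather than one half of the original frequency. A sketch that reproduces that ratio:

from wordfreq import word_frequency

# Multi-token strings are tokenized and their per-token frequencies combined.
thanks = word_frequency('谢谢', 'zh')
doubled = word_frequency('谢谢谢谢', 'zh')
print(thanks / doubled)  # about 20, per the test above
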
github hhexiy / pungen / scripts / compose_HIT.py
def compose_collaborative_pun_hit(data_dict, key_filter, outfile, top_k=5):
    with open(outfile, 'w') as outf:
        header = ['Pun_alter']
        for i in range(top_k):
            header.append('sentence_'+str(i+1))
        assert len(header) == top_k + 1
        outf.write(','.join(header)+'\n')
        for key in key_filter:
            results = data_dict[key]
            if word_frequency(key[0], 'en') < 1e-6 or word_frequency(key[1], 'en') < 1e-6:
                print('skip the keyword pair:', ' '.join(key))
                continue
            contents = []
            contents.append('-'.join(key))
            if type(results) is tuple:
                results = results[0]
            for res in results[:top_k]:
                contents.append(res)
            #print(type(contents), contents)
            outf.write(','.join(contents)+'\n')
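
This script uses word_frequency as a rarity filter, skipping any keyword pair in which either word falls below 1e-6. The same pattern generalizes to any candidate list; a sketch with an assumed helper and the same threshold:

from wordfreq import word_frequency

def common_enough(word, lang='en', threshold=1e-6):
    # Assumed helper: keep only words at least as frequent as the threshold
    # used in the script above; unknown words return 0 and are dropped.
    return word_frequency(word, lang) >= threshold

candidates = ['banana', 'esquivalience']
print([w for w in candidates if common_enough(w)])  # the nonsense word is dropped
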
github LuminosoInsight / python-ftfy / scripts / mojibakery.py
def add_language_trigrams(normal_freqs, baked_freqs, language):
    """
    Collect the trigram frequencies of both correct and mojibaked text, using
    word examples from the given language.
    """
    for baseword in wordfreq.iter_wordlist(language):
        freq = wordfreq.word_frequency(baseword, language)
        for word in set([baseword, baseword.upper()]):
            if any(letter.isdigit() for letter in word):
                continue
            for frame in FRAMES:
                padded = frame % word
                for trigram in get_trigrams(padded):
                    normal_freqs[trigram] += freq

                for enc1 in COMMON_ENCODINGS + LANGUAGE_ENCODINGS[language]:
                    for enc2 in COMMON_ENCODINGS + LANGUAGE_ENCODINGS[language]:
                        if enc1 != enc2 and (enc1 not in COMMON_ENCODINGS or enc2 not in COMMON_ENCODINGS):
                            try:
                                mojibaked = padded.encode(enc1).decode(enc2)
                                if mojibaked != padded:
                                    for trigram in get_trigrams(mojibaked):
                                        baked_freqs[(trigram, enc2, enc1)] += freq
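
Here word_frequency is paired with wordfreq.iter_wordlist, which yields a language's wordlist in decreasing order of frequency, so each trigram count is weighted by how common its source word is. A minimal sketch of that pairing:

import itertools
import wordfreq

# iter_wordlist yields words from most to least frequent; pairing each word
# with its frequency gives a weight to use in downstream counting.
for word in itertools.islice(wordfreq.iter_wordlist('en'), 5):
    print(word, wordfreq.word_frequency(word, 'en'))
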
github coreybobco / generativepoetry-py / generativepoetry / utils.py
out word if less frequent than this threshold
    """
    # Datamuse is built from web scraping and occasionally returns offensive and oppressive language, which I
    # filter out here. Although there is an appropriate and even critical way for humans to write poetry using
    # some of these words that might be considered edge cases (e.g. Hottentot), a stochastic text generator has
    # no historical sense with which to do that, so I have decided to exclude these.
    unfitting_words = pkgutil.get_data('generativepoetry', 'data/hate_words.txt').decode("utf-8").splitlines()
    unfitting_words.extend(pkgutil.get_data('generativepoetry', 'data/abbreviations_etc.txt').decode("utf-8")
                           .splitlines())
    exclude_words.extend(unfitting_words)  # Some words Datamuse tends to return that disrupt poetic flow
    validate_str(string)
    if len(string) < 3:
        return False
    if has_invalid_characters(string):
        return False
    if word_frequency(string, 'en') < word_frequency_threshold:
        return False
    if spellcheck and not hobj.spell(string):
        return False
    if string in exclude_words:
        return False
    return True
github OWASP / passfault / wordlists / languageWordlists.py
integral80 = 0
for i in range(len(nl)):
    integral80 += word_frequency(nl[i], 'nl', wordlist='large')
    if (integral80 <= 0.80*integral100):
        nlPopular.write(nl[i] + '\n')
    else:
        nlLongTail.write(nl[i] + '\n')

nlPopular.close()
nlLongTail.close()

#---------------------------------------------------------------
ptPopular = open(dest + '/ptPopular.txt', 'w')
ptLongTail = open(dest + '/ptLongTail.txt', 'w')

integral100 = 0
for i in range(len(pt)):
    integral100 += word_frequency(pt[i], 'pt', wordlist='large')

integral80 = 0
for i in range(len(pt)):
    integral80 += word_frequency(pt[i], 'pt', wordlist='large')
    if (integral80 <= 0.80*integral100):
        ptPopular.write(pt[i] + '\n')
    else:
        ptLongTail.write(pt[i] + '\n')

ptPopular.close()
ptLongTail.close()

#---------------------------------------------------------------
svPopular = open(dest + '/svPopular.txt', 'w')
svLongTail = open(dest + '/svLongTail.txt', 'w')
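
These scripts split each language's wordlist at the point where the running total of frequencies reaches 80% of the grand total: words before that point go to the 'popular' file and the rest to the 'long tail' file. A generic sketch of the same cumulative-frequency split (the helper name and the top-n cutoff are made up for illustration):

from wordfreq import top_n_list, word_frequency

def split_by_cumulative_frequency(lang, n=50000, share=0.80):
    # Hypothetical helper following the scripts above: take the top n words
    # of the 'large' wordlist, then split them where the running frequency
    # total reaches `share` of the total mass of those words.
    words = top_n_list(lang, n, wordlist='large')
    freqs = [word_frequency(w, lang, wordlist='large') for w in words]
    total = sum(freqs)
    running = 0.0
    popular, long_tail = [], []
    for w, f in zip(words, freqs):
        running += f
        (popular if running <= share * total else long_tail).append(w)
    return popular, long_tail

nl_popular, nl_long_tail = split_by_cumulative_frequency('nl')
print(len(nl_popular), len(nl_long_tail))
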
github SPOClab-ca / COVFEFE / nodes / lexicosyntactic_multi.py
def compute_word_frequency_norms(self):
        freqs = []
        for char in self.tokens:
            freq = wordfreq.word_frequency(char, 'zh')

            if freq == 0:
                continue

            freqs.append(freq)

        try:
            self.features['mean_word_frequency'] = statistics.mean(freqs)
            self.features['median_word_frequency'] = statistics.median(freqs)
        except statistics.StatisticsError:  # freqs is empty: no tokens were found in the wordlist
            self.features['mean_word_frequency'] = 0
            self.features['median_word_frequency'] = 0
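
This feature extractor averages raw word_frequency values over a document's tokens, skipping tokens wordfreq has never seen. A variant worth considering (sketched below; it is not what the project above does) is to average zipf_frequency values instead, since raw proportions are dominated by a handful of very common words.

import statistics
from wordfreq import zipf_frequency

def mean_zipf(tokens, lang='zh'):
    # Sketch: average log-scale Zipf frequencies, ignoring unseen tokens
    # (zipf_frequency returns 0 for words missing from the wordlist).
    zipfs = [z for z in (zipf_frequency(t, lang) for t in tokens) if z > 0]
    return statistics.mean(zipfs) if zipfs else 0.0

print(mean_zipf(['谢谢', '感谢']))
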
github OWASP / passfault / wordlists / wordlist scripts / wordlists.py
nlLongTail.write(nl[i] + '\n')

nlPopular.close()
nlLongTail.close()

#---------------------------------------------------------------
ptPopular = open(dest + '/ptPopular.words', 'w')
ptLongTail = open(dest + '/ptLongTail.words', 'w')

integral100 = 0
for i in range(len(pt)):
    integral100 += word_frequency(pt[i], 'pt', wordlist='large')

integral80 = 0
for i in range(len(pt)):
    integral80 += word_frequency(pt[i], 'pt', wordlist='large')
    if (integral80 <= 0.80*integral100):
        ptPopular.write(pt[i] + '\n')
    else:
        ptLongTail.write(pt[i] + '\n')

ptPopular.close()
ptLongTail.close()


integralList = []
integral = 0
f = []
for i in range(0, len(pt)):
    # each word's frequency relative to the most frequent word in the list
    f.insert(i, word_frequency(pt[i], 'pt', wordlist='large')/word_frequency(pt[0], 'pt', wordlist='large'))
    integral += word_frequency(pt[i], 'pt', wordlist='large')
    integralList.insert(i, integral)