How to use the wordfreq.zipf_frequency function in wordfreq

To help you get started, we’ve selected a few wordfreq examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github kmicklas / sentence-pairs / sort.py View on Github external
# Sort sentence pairs by how common their vocabulary is in the target language.
#
# Reads a tab-separated file, scores the first column of every row with wordfreq
# Zipf frequencies, and writes "<col 0>\t<col 1>" lines to stdout, ordered from the
# highest (min frequency, mean frequency, col 1) tuple to the lowest.
if len(sys.argv) != 3:
    print('Usage: python3 sort.py target-lang pairs.csv')
    sys.exit(1)

targetLang = sys.argv[1]
pairsPath = sys.argv[2]

# Maps sentence -> (min Zipf frequency, mean Zipf frequency, second column).
# Keyed by the sentence, so a later duplicate row replaces an earlier one.
pairs = {}

# newline='' lets the csv module do its own line-ending handling, as its
# documentation requires for file objects passed to csv.reader.
with open(pairsPath, 'r', encoding='utf-8', newline='') as pairsFile:
    reader = csv.reader(pairsFile, delimiter='\t')
    for row in reader:
        # Blank lines and rows without a second column cannot form a pair;
        # indexing them would raise IndexError.
        if len(row) < 2:
            continue

        words = wordfreq.tokenize(html.unescape(row[0]), targetLang)
        freqs = [wordfreq.zipf_frequency(word, targetLang, wordlist='combined')
                 for word in words]

        if freqs:
            minfreq = min(freqs)
            avgfreq = sum(freqs) / len(freqs)
        else:
            # No tokens (e.g. a punctuation-only sentence): min() and the mean
            # would both raise. Score it 0.0, the value zipf_frequency gives
            # unknown words by default, so the pair is kept and sorts last.
            minfreq = avgfreq = 0.0

        pairs[row[0]] = (minfreq, avgfreq, row[1])

# Highest scores first; ties on both frequencies fall back to the second column.
pairList = sorted(pairs.items(), key=lambda i: i[1], reverse=True)

# Write bytes directly so the output is UTF-8 regardless of the console encoding.
for sentence, (_, _, translation) in pairList:
    sys.stdout.buffer.write((sentence + '\t' + translation + '\n').encode('utf-8'))
github alex-lew / robot-mind-meld / server.py View on Github external
async def first_word(request):
    """
    Pick a random opening word and return it wrapped as {"word": <word>}.

    Words are drawn uniformly from `index` until one is both in `first_words`
    and has an English Zipf frequency of at least 3.5 in the "large" wordlist.
    `request` is not read here.

    Raises IndexError (from random.choice) if `index` is empty. The loop does
    not terminate if no word in `index` satisfies both conditions.
    """
    # Build the candidate list once; the original rebuilt it on every retry.
    candidates = list(index)
    w = random.choice(candidates)
    while w not in first_words or wordfreq.zipf_frequency(w, 'en', wordlist="large") < 3.5:
        w = random.choice(candidates)
    return json({"word": w})
github alex-lew / robot-mind-meld / server.py View on Github external
def canUse(candidate, past):
    """
    Screen a candidate word against frequency, spelling, blocklist and history.

    Returns False as soon as any check rejects `candidate`. The lines shown
    here contain no other return statement, so a candidate that passes every
    check falls off the end and yields None.
    """
    candidateFrequency = wordfreq.zipf_frequency(candidate, "en", wordlist="large")
    stemFrequency = wordfreq.zipf_frequency(ps.stem(candidate), "en", wordlist="large")
    # Frequency of the more common of the word and its stem; not read in the
    # lines shown here.
    candidateRootFrequency = max(candidateFrequency, stemFrequency)

    # Rejected in order: too infrequent or too frequent (like "a" or "the"),
    # not purely alphabetic (mostly '#'-containing words), or a bad word.
    outsideBand = candidateFrequency < 2.3 or candidateFrequency > 6
    if outsideBand or not candidate.isalpha() or candidate in bad_words:
        return False

    # Finally, reject anything lexically related to a word used before.
    for earlier in past:
        if lexicallyRelated(candidate, earlier):
            return False
github NervanaSystems / nlp-architect / nlp_architect / solutions / trend_analysis / scoring_utils.py View on Github external
def get_freq_scores(self, group_similar_spans=True):
        """
        Score every distinct phrase by the English Zipf frequency of its text.

        Walks the entries of self.doc_text_spans (paired with self.documents,
        so iteration stops at the shorter of the two), looks up each phrase's
        `.text` once, and hands the phrase -> score mapping together with
        `group_similar_spans` to self._maybe_group_and_sort, whose result is
        returned.
        """
        scores = {}
        for _, spans in zip(self.documents, self.doc_text_spans):
            for span in spans:
                # A phrase already scored keeps its first score.
                if span in scores:
                    continue
                scores[span] = zipf_frequency(span.text, 'en')
        return self._maybe_group_and_sort(group_similar_spans, scores)
github alex-lew / robot-mind-meld / server.py View on Github external
def canUse(candidate, past):
    """
    Check whether a candidate is OK to use.
    """
    candidateFrequency = wordfreq.zipf_frequency(candidate, "en", wordlist="large")
    candidateRootFrequency = max(
        candidateFrequency,
        wordfreq.zipf_frequency(ps.stem(candidate), "en", wordlist="large"))

    # Reject words that are too infrequent or too frequent (like "a" or "the")
    if candidateFrequency < 2.3 or candidateFrequency > 6:
        return False

    # Mostly, this rejects '#'-containing words
    if not candidate.isalpha():
        return False

    # Is it a bad word?
    if candidate in bad_words:
        return False