How to use the nltk.FreqDist class in nltk

To help you get started, we’ve selected a few nltk examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github aalind0 / NLP-Sentiment-Analysis-Twitter / View on Github external
for p in short_neg.split('\n'):
    documents.append( (p, "neg") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:

# Pickling documents.
# A context manager closes (and therefore flushes) the file even if
# pickle.dump() raises; the bare open() used before never closed the handle.
with open("documents.pickle", "wb") as save_documents:
    pickle.dump(documents, save_documents)

# Frequency Distribution
all_words = nltk.FreqDist(all_words)

# NOTE(review): keys() is sliced as-is, so this keeps the first 5000 keys in
# whatever order FreqDist yields them -- presumably not the 5000 most frequent
# words on NLTK 3; confirm whether most_common(5000) was intended.
word_features = list(all_words.keys())[:5000]

with open("word_features5k.pickle", "wb") as save_word_features:
    pickle.dump(word_features, save_word_features)

# Adjusting the feature finding function, using tokenizing by word in the document.
def find_features(document):
    """Map every known feature word to whether it occurs in *document*.

    The document is tokenized with ``word_tokenize`` and each entry of the
    module-level ``word_features`` list is looked up among the tokens.

    Args:
        document: Raw text of a single document.

    Returns:
        dict: ``{feature_word: bool}`` with one entry per word in
        ``word_features``.
    """
    # A set gives O(1) membership tests; the result is the same as testing
    # against the token list, since only presence matters.
    words = set(word_tokenize(document))
    # The mapping is returned explicitly: previously it was built and then
    # discarded, so callers always received None.
    return {w: (w in words) for w in word_features}
github spraakbanken / sparv-pipeline / sparv / View on Github external
def make_model(stats_infile, picklefile, smoothingparam=0.001, min_freq=3, protocol=-1):
    """Train a probability model on a korp statistics file and save it as a pickle file.
    The model is a LidstoneProbDist (NLTK) which has tuples (wordform, MSD-tag) as keys
    and smoothed probabilities as values."""
    fdist = FreqDist()
    with open(stats_infile, encoding='utf-8') as f:
        for line in f:
            fields = line[:-1].split('\t')
            word = fields[0]
            # Skip word forms that occur fewer times than min_freq
            if int(fields[4]) < min_freq:
            # Get rid of all urls
            if word.startswith("http://"):
            # # Words that only occur once may only contain letters and hyphens
            # if fields[4] == '1' and any(not (c.isalpha() or c == "-") for c in word):
            #     continue
            # if len(word) > 100:
            #     continue
            simple_msd = fields[1][:fields[1].find('.')] if '.' in fields[1] else fields[1]
github ellarabi / reddit-l2 / View on Github external
def create_features_map(cfg_filename, vocab_filename):
		start = time.clock()
		configuration = Utils.parse_classification_configuration(cfg_filename)
		text_chunks, labels = Utils.divide_into_chunks(configuration)

		countries = [ for entry in configuration]

		dictionary = {}
		words_list = Utils.load_words_list(vocab_filename)
		for country, chunk in zip(countries, text_chunks):
			dcountry = {}
			dist = FreqDist(chunk.split())
			for word in words_list:
				dcountry[word] = dist[word]
			# end for
			dictionary[country] = dcountry
		# end for

		with open('vocab.countries.pkl', 'wb') as fout:
			pickle.dump(dictionary, fout, pickle.HIGHEST_PROTOCOL)
		# end with

		print('time:', '{0:.3f}'.format(time.clock() - start))
	# end def
github walshbr / nltk / ch_five / View on Github external
def display():
    """Plot lookup-tagger performance against model size for Brown 'news'.

    Words are ranked by frequency, and for each model size the top-ranked
    words are handed to ``performance`` together with the word/tag
    conditional frequency distribution.
    """
    import pylab

    # (word, count) pairs for the news category, most frequent first
    ranked_pairs = nltk.FreqDist(brown.words(categories='news')).most_common()
    # keep only the words, preserving the frequency ranking
    ranked_words = [pair[0] for pair in ranked_pairs]
    # per-word distribution of the tags seen in the news category
    tag_dist = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))

    # powers of two: 2**0, 2**1, ..., 2**14
    model_sizes = 2 ** pylab.arange(15)

    # score one model per size, each built from the `size` most frequent words
    scores = []
    for size in model_sizes:
        scores.append(performance(tag_dist, ranked_words[:size]))
    pylab.plot(model_sizes, scores, '-bo')

    # figure title and x-axis label
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
github aweiand / TwitterSentiment / View on Github external
def get_word_features(wordlist):
    """Return the keys of an ``nltk.FreqDist`` built from *wordlist*."""
    return nltk.FreqDist(wordlist).keys()
github mitmedialab / DataBasic / databasic / logic / View on Github external
def term_frequency(words):
    """Turn a list of words into a NLTK frequency distribution object.

    English stopwords are removed from the resulting distribution, and the
    elapsed time is reported through ``logging.debug``.

    Args:
        words: Iterable of word tokens to count.

    Returns:
        The ``FreqDist`` of *words* with stopword entries deleted.
    """
    t1 = time.time()
    fdist = FreqDist(words)
    # remove stopwords here rather than in corpus text for speed
    # The stopword list is loaded once into a set; calling
    # stopwords.words('english') inside the loop re-fetched the whole list and
    # scanned it linearly for every distinct word.
    english_stopwords = set(stopwords.words('english'))
    # Iterate over a snapshot of the keys because entries are deleted in-loop.
    for w in list(fdist):
        if w in english_stopwords:
            del fdist[w]
    t2 = time.time()
    logging.debug("   create term freq: %d", t2 - t1)
    return fdist
github nreimers / truecaser / truecaser / View on Github external
def __init__(self):
        """Start with four empty frequency distributions and default casing settings."""
        # Each attribute gets its own, independent nltk.FreqDist instance.
        (self.uniDist,
         self.backwardBiDist,
         self.forwardBiDist,
         self.trigramDist) = (nltk.FreqDist() for _ in range(4))
        # Empty lookup table; populated elsewhere.
        self.wordCasingLookup = {}
        self.title_case_unknown_tokens = True
github sujitpal / nltk-examples / src / book / View on Github external
def ch05_10_train_test_unigram_tagger():
  """Build a lookup UnigramTagger from Brown 'news' and try it on 'editorial'.

  Every word seen in the news category is mapped to its most frequent tag;
  that mapping is used as the tagger model. The first ten editorial
  sentences are tagged and printed, followed by the (word, tag) pairs whose
  tag is None.
  """
  from nltk.corpus import brown
  fd = nltk.FreqDist(brown.words(categories="news"))
  cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))
  # word -> most frequent tag for that word in the news category
  most_freq_pos = {word: cfd[word].max() for word in fd}
  unigram_tagger = nltk.UnigramTagger(model=most_freq_pos)
  for sent in brown.sents(categories="editorial")[:10]:
    tagged = unigram_tagger.tag(sent)
    # print() calls replace Python 2 print statements, which do not parse on
    # Python 3; the printed text is unchanged.
    print(sent)
    print(">>>", tagged)
    # A comprehension replaces filter(lambda (a,b): ...): tuple parameters
    # were removed in Python 3, and a list keeps the output a printed list.
    # None is compared by identity rather than with ==.
    print("not tagged: ", [pair for pair in tagged if pair[1] is None])
github DUanalytics / pyAnalytics / 87-TM / View on Github external
wordcloud3 = wc.generate_from_frequencies(words)

# from text document
import nltk
from nltk.corpus import webtext
from nltk.probability import FreqDist
from wordcloud import WordCloud
import matplotlib.pyplot as plt"webtext")
import os
# sample data: word tokens read from the given text file
wt_words = webtext.words('E:/pywork/pyprojects/pyanalytics19/data/testing.txt')
data_analysis = nltk.FreqDist(wt_words)
# keep only the tokens longer than three characters, with their counts
filter_words = {
    token: count
    for token, count in data_analysis.items()
    if len(token) > 3
}
wcloud = WordCloud().generate_from_frequencies(filter_words)

# render the generated cloud
plt.imshow(wcloud, interpolation='bilinear')
github sujitpal / nltk-examples / src / book / View on Github external
def ex12():
  """Report how many distinct words cmudict has and how many have several entries.

  Prints the number of distinct words among the cmudict entries and the
  number of words that appear in more than one entry (i.e. have more than
  one listed pronunciation).
  """
  from nltk.corpus import cmudict
  # Materialise the words as a list: the sequence is consumed twice (set and
  # FreqDist), and a lazy map() would be exhausted after the first use on
  # Python 3. The comprehension also replaces the Python 2-only
  # tuple-parameter lambda.
  words = [word for word, _pron in cmudict.entries()]
  distinct_words = set(words)
  fd = nltk.FreqDist(words)
  # Count words occurring more than once. This does not depend on the order
  # in which FreqDist yields its keys, and it matches the label printed below.
  multi_prons = sum(1 for count in fd.values() if count > 1)
  print("#-distinct words:", len(distinct_words))
  print("#-words with multiple prons:", multi_prons)