How to use the nltk.corpus module in nltk

To help you get started, we’ve selected a few nltk examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github openstates / openstates / fiftystates / site / search / View on Github external
def keywordize(str):
    """
    Splits a string into words, removes common stopwords, stems and removes
    duplicates.

    :param str: the text to extract keywords from.  The name shadows the
        builtin, but it is kept so existing keyword callers keep working.
    :returns: a ``set`` of lower-cased, Porter-stemmed words.  Only tokens
        that are purely alphabetic or purely numeric are kept.
    """
    sents = nltk.tokenize.sent_tokenize(str)

    # Tokenise sentence by sentence and collect every token in one flat list.
    words = []
    for sent in sents:
        words.extend(nltk.tokenize.word_tokenize(sent))

    stemmer = nltk.stem.porter.PorterStemmer()
    # Build the stopword collection once as a set so each membership test
    # below is a hash lookup instead of a scan of the whole list.
    # NOTE(review): words() is called without a language argument; confirm
    # that filtering against every installed language's list is intended.
    stop_words = set(nltk.corpus.stopwords.words())

    # Stopwords are matched on the lower-cased token *before* stemming; the
    # surrounding set() removes duplicate stems.
    return set(stemmer.stem(word.lower()) for word in words
               if (word.isalpha() or word.isdigit())
               and word.lower() not in stop_words)
github zermelozf / esn-lm / examples / View on Github external
from esnlm.readouts import *
import random

print "... loading text"
#with open('./../datasets/t5_train') as f:
#    text_train =(' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_train)
#    text_train = (' . '.join(text_train)).split(' ')
#with open('./../datasets/t5_test') as f:
#    text_test =(' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_test)
#    text_test = (' . '.join(text_test)).split(' ')

import nltk
# Load the tokens of 'austen-emma.txt' from NLTK's Gutenberg corpus.
text_train = list(nltk.corpus.gutenberg.words('austen-emma.txt'))
# The test text is the very same list object as the training text.
text_test = text_train
vocabulary = list(set(text_train))
# Map every word to its position in ``vocabulary`` once.  ``vocabulary`` holds
# unique words, so this yields exactly the indices ``vocabulary.index`` would,
# but each lookup is constant time instead of a linear scan per token.
word_to_index = {w: i for i, w in enumerate(vocabulary)}

### Transform text into labels
# Inputs are all tokens but the last; targets are the tokens that follow them.
utrain = [word_to_index[w] for w in text_train[:-1]]
ytrain = [word_to_index[w] for w in text_train[1:]]

utest = [word_to_index[w] for w in text_test[:-1]]
ytest = [word_to_index[w] for w in text_test[1:]]

print "... building model"
### Hyperparameters
input_dim = output_dim = len(vocabulary)
features_dim, reservoir_dim = 2, 25
spectral_radius = 0.97
github sbadecker / keyword_ranker / keyword_ranker / View on Github external
# has been extended by sbadecker to support lemmatization using
# WordNetLemmatizer from NLTK.

from __future__ import absolute_import
from __future__ import print_function
import re
import operator
import six
from six.moves import range
from nltk.stem import WordNetLemmatizer
import nltk

# Make sure the WordNet corpus used by WordNetLemmatizer is available.
# Merely referencing ``nltk.corpus.wordnet`` only returns a lazy loader and
# never fails, so force the load to surface a missing corpus and download it
# on demand.
try:
    nltk.corpus.wordnet.ensure_loaded()
except LookupError:
    nltk.download('wordnet')

def is_number(s):
    """Return True if the string *s* parses as a number, False otherwise.

    Strings containing a ``'.'`` are parsed with ``float``; all others are
    parsed with ``int``.  As a consequence, exponent notation without a dot
    (e.g. ``"1e5"``) is not treated as a number.

    :param s: the string to test.
    :returns: ``True`` when parsing succeeds, ``False`` on ``ValueError``.
    """
    try:
        # Only the success or failure of the conversion matters here; the
        # converted value itself is discarded.
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False

def load_stopwords(stopword_file):
    Utility function to load stop words from a file and return as a list of
github bugraoral / TextRank / View on Github external
def score_keyphrases_by_textrank(text, n_keywords=0.05):
    from itertools import takewhile, tee
    import networkx, nltk

    stop_words = set(nltk.corpus.stopwords.words('turkish'))

    # tokenize for all words, and extract *candidate* words
    words = [word.lower()
             for sent in nltk.sent_tokenize(text)
             for word in nltk.word_tokenize(removePunc(sent,' ').strip())
             if len(word) > 2 and word.lower() not in stop_words]

    candidates = extract_candidate_words(text)
    # build graph, each node is a unique candidate
    graph = networkx.Graph()
    # iterate over word-pairs, add unweighted edges into graph

    def pairwise(iterable):
        """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
        a, b = tee(iterable)
github pthomas1 / MLBlog / View on Github external
def compare_stemming_to_lemmatization():
    """Load the word lists of six NLTK corpora and set up per-corpus counters.

    NOTE(review): only the opening of this function is visible in this
    excerpt; the loop that fills the counters is not shown here.
    """

    # load each of the corpora
    abc_words = nltk.corpus.abc.words()
    genesis_words = nltk.corpus.genesis.words()
    gutenberg_words = nltk.corpus.gutenberg.words()
    inaugural_words = nltk.corpus.inaugural.words()
    state_union_words = nltk.corpus.state_union.words()
    webtext_words = nltk.corpus.webtext.words()

    # all_words and corpora_names are parallel lists: same order, same length.
    all_words = [abc_words, genesis_words, gutenberg_words, inaugural_words,
                 state_union_words, webtext_words]
    corpora_names = ["ABC", "Genesis", "Gutenberg", "Inaugural",
                     "Union", "Web"]

    word_counts = []
    lemmatized_counts = []
    stemmed_counts = []

    # iterate through each corpus and generate counts of the unique tokens
    # in each
github bjascob / LemmInflect / scripts / 04_BuildOverrides / View on Github external
# This script creates an overrides file that allows the system to overcome issues with
# the way Spacy lemmatizes words and invalid data in the AGID.
# The created file is a mapping from lemma/tag to the "best" inflection.  Note that
# this only overrides methods where the treebank tag is used, not ones where the
# simplified AGID tag (V, N or A) is used.
# Note that if the AGID version is changed this script should be re-run.  Additionally
# if Spacy changes their lemmatizer or if a different lemmatizer is used consider re-running
# this script.
# Script entry point: configure logging, set the run parameters, load the
# spaCy model and collect the corpus sentences.
if __name__ == '__main__':
    # Log records below WARNING are suppressed.
    level  = logging.WARNING
    format = '[%(levelname)s %(filename)s ln=%(lineno)s] %(message)s'
    logging.basicConfig(level=level, format=format)

    # Configuration
    #corp_fns  = ['austen-emma.txt']                # 7,491 sentences
    corp_fns  = nltk.corpus.gutenberg.fileids()     # 18 files with 94K sentences
    max_chars = int(1e9)    # passed to loadNLTKCorpus for every corpus file
    req_count = 4       # require at least this many instances in corpus for an override
    lemminflect.setUseInternalLemmatizer(True)      # use lemminflect or spaCy's lemmatizer
    inflect_oov = True                              # test/inflect out-of-vocab words
    multiples_fn = 'CorpMultiInfls.txt'

    # Load Spacy
    print('Loading Spacy model')
    nlp = spacy.load('en_core_web_sm')
    print('Using spaCy version ', spacy.__version__)

    # Load the corpus to test with
    print('Loading corpus')
    sents = []
    # Accumulate the sentences of every configured corpus file into one list.
    for corp_fn in corp_fns:
        sents += loadNLTKCorpus(corp_fn, max_chars)
github JohannesBuchner / languagecheck / View on Github external
def a_or_an_words(paragraphs):
	with + '_a.html', 'w', 'latin1') as f:
		f.write(header % dict(title='a or an'))
		f.write("""<h1>a or an?</h1>
		The rule is that <em>a</em> is used before a word starting with a 
		consonant (a house, a unicorn), and <em>an</em> before a vowel 
		(an ox, an hour). Here we check whether the following word is a vowel or consonant.
		<style type="text/css">
		.evaluation{font-family: monospace; color: gray;}
		from collections import defaultdict
		firstsyll = defaultdict(list)
		for word, syl in nltk.corpus.cmudict.entries():
		nfound = 0
		nwrong = 0
		for para in paragraphs:
			for txt, tags, entities in para:
				for i, (word, _wordtype) in enumerate(tags):
					if word not in ('a', 'an'):
					expect_vowel = word == 'an'
					if i + 1 == len(tags):
						# no word after a/an.
					nextword, _wordtype2 = tags[i+1]
					if nextword.isupper():</ul>
github hotokokoa / social-media-mining / Code / View on Github external
def process(words):
  word_dict = {}
  result = ""
  real_words = set(nltk.corpus.words.words())
  for w in words:
    if w not in stop_words.ENGLISH_STOP_WORDS and w in real_words and len(w)>1:
      # print(w)
      result = result + w + " "
      if w in word_dict:
        word_dict[w] += 1
      else:
        word_dict[w] = 1
  sorted_word_dict = sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True)
  res = {}
  with open("t.out","w") as f:
    tmp = 0
    for items in sorted_word_dict:
      if int(items[1]) < 50:
      # if tmp>200:
github acrosson / nlp / subject_extraction / View on Github external
def trained_tagger(existing=False):
    """Returns a trained trigram tagger

    existing : set to True if already trained tagger has been pickled

    When ``existing`` is true the tagger is read back from
    ``trained_tagger.pkl`` in the current working directory.  Otherwise a
    ``SubjectTrigramTagger`` is built from the Brown, CoNLL-2000 and Treebank
    tagged sentences, written to that same file, and returned.
    """
    if existing:
        # NOTE(security): unpickling can execute arbitrary code, so
        # 'trained_tagger.pkl' must only ever come from a trusted source.
        with open('trained_tagger.pkl', 'rb') as pickled:
            return pickle.load(pickled)

    # Aggregate trained sentences for N-Gram Taggers
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()

    # Create instance of SubjectTrigramTagger and persist instance of it
    trigram_tagger = SubjectTrigramTagger(train_sents)
    # The context manager guarantees the file is flushed and closed even if
    # pickling fails part-way through.
    with open('trained_tagger.pkl', 'wb') as pickled:
        pickle.dump(trigram_tagger, pickled)

    return trigram_tagger
github sdockray / sma / sma / View on Github external
	for k in a.posts:

	print(str(len(titles)) + ' titles')
	print(str(len(links)) + ' links')
	print(str(len(synopses)) + ' synopses')

	ranks = []
	for i in range(0,len(titles)):
	# load nltk's English stopwords as variable called 'stopwords'
	stopwords = nltk.corpus.stopwords.words('english')
	# load nltk's SnowballStemmer as variable called 'stemmer'
	from nltk.stem.snowball import SnowballStemmer
	stemmer = SnowballStemmer("english")

	totalvocab_stemmed = []
	totalvocab_tokenized = []
	for i in synopses:
		allwords_stemmed = tokenize_and_stem(i)
		allwords_tokenized = tokenize_only(i)

	vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
	from sklearn.feature_extraction.text import TfidfVectorizer