How to use the nltk.corpus.stopwords.words function in nltk

To help you get started, we’ve selected a few examples of nltk.corpus.stopwords.words, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github hoaxanalyzer / hoax-search-vote / extractor / View on Github external
def preprocess(text):
    """Clean ``text`` and accumulate its normalised, non-stopword tokens.

    The text is first forced to ASCII and stripped of quote-like characters,
    then every run of characters outside the allowed set is collapsed to a
    single space. Each space-separated token is skipped when it is an English
    stopword or is found in ``punctuations`` / ``hoax_stopwords``. Remaining
    tokens are handled as follows:

    * all upper-case and accepted by ``dictionary.check``: lemmatised
      (lower-cased) form is appended;
    * all upper-case but not in the dictionary: title-cased form is appended;
    * not upper-case but in the dictionary: lemmatised form is appended,
      followed by the original token.

    NOTE(review): as shown, the function never returns ``result`` and has no
    branch for tokens that are neither upper-case nor in the dictionary. The
    snippet looks truncated (probably a dropped ``else:`` before the final
    append and a dropped ``return result``) -- confirm against the upstream
    file before relying on this behaviour.

    :param text: the raw input string.
    """
    # Each non-ASCII byte becomes U+FFFD, which is mapped to "_"; a run of
    # three (a 3-byte UTF-8 character such as a curly quote) becomes "'".
    text = text.encode('utf-8').decode("ascii", "replace").replace(u"\ufffd", "_").replace("___", "'").replace("'s", " ").replace("``", " ").replace("''", " ").replace("_", " ").replace("'"," ").replace("`"," ")
    # NOTE(review): inside the character class, ``&-_`` is a range from "&"
    # to "_", not three literal characters -- confirm that is intended.
    text = re.sub("[^0-9a-zA-Z !\"/:;<=>?.,!@#$%^&-_|()']+", " ", text)

    # Read the stopword corpus once instead of once per token.
    english_stopwords = set(stopwords.words('english'))

    result = ""
    for token in text.split(" "):
        # ``token`` holds no spaces, so ``word`` is always equal to ``token``.
        word = token.split(" ")[0]
        if word in english_stopwords or token in punctuations or token in hoax_stopwords:
            continue
        if len(word) == 0:
            continue

        if word.isupper() and dictionary.check(word.lower()):
            result += _lemmatize_token(token) + " "
        elif word.isupper():
            result += token.title() + " "
        elif dictionary.check(word.lower()):
            result += _lemmatize_token(token) + " "
            # NOTE(review): the original token is appended as well; this is
            # likely the body of a dropped ``else:`` branch -- confirm.
            result += token + " "


def _lemmatize_token(token):
    """Lemmatise ``token`` in lower case, retrying as a verb if unchanged."""
    lowered = token.lower()
    new_token = lemmatizer.lemmatize(lowered)
    if new_token == lowered:
        new_token = lemmatizer.lemmatize(lowered, pos='v')
    return new_token
github hoaxanalyzer / hoax-search-vote / backup / View on Github external
def create_dic(self, documents):
		"""Build a dictionary and bag-of-words corpus from ``documents``.

		Each document is lower-cased and split on whitespace, English
		stopwords are removed, and tokens that occur only once across all
		documents are dropped before the dictionary is built.

		:param documents: iterable of document strings.
		:return: two-element list ``[dictionary, corpus]`` where ``dictionary``
		    is a ``corpora.Dictionary`` over the filtered tokens and ``corpus``
		    holds ``dictionary.doc2bow(text)`` for every document.
		"""
		from collections import Counter

		# Read the stopword corpus once; the original re-read it for every word.
		english_stopwords = set(stopwords.words('english'))
		texts = [[word for word in document.lower().split() if word not in english_stopwords]
		         for document in documents]

		# Keep only tokens seen more than once across the whole collection.
		frequency = Counter(token for text in texts for token in text)
		texts = [[token for token in text if frequency[token] > 1]
		         for text in texts]

		dictionary = corpora.Dictionary(texts)
		corpus = [dictionary.doc2bow(text) for text in texts]
		return [dictionary, corpus]
github ParasAvkirkar / -Competitive-Coding-Problem-Classifier-and-Recommender / Data Extraction / codechef / View on Github external
def create_word_features(self, words):
		# Joins the entries of ``useful_words`` with single spaces and returns
		# the resulting string.
		# NOTE(review): this snippet is truncated and does not parse as shown:
		# the inner ``for wrd in line.split():`` loop has no body (so ``w`` is
		# never filled in the visible code) and the ``useful_words``
		# comprehension is cut off after ``not in``. Presumably the loop
		# appended ``wrd`` to ``w`` and the filter compared against a stopword
		# list -- confirm against the upstream file.
		# print words
		w = []
		for line in words:
			for wrd in line.split():
		useful_words = [word for word in w if word not in
		my_dict = ' '.join([word for word in useful_words])
		# print my_dict
		return my_dict
github HsuWanTing / unified-summarization / old_code / sentence-selector-pg / View on Github external
def cluster_texts(texts, clusters=3):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    # NOTE(review): this snippet is truncated and does not parse as shown:
    # the ``TfidfVectorizer(`` call below is never closed, the final ``for``
    # loop has no body, and no fit of ``km_model`` is visible before
    # ``labels_`` is read. Presumably the loop filled ``clustering`` keyed by
    # label -- confirm against the upstream file.
    vectorizer = TfidfVectorizer(tokenizer=process_text,

    tfidf_model = vectorizer.fit_transform(texts)
    # ``clusters`` sets the number of K-Means clusters.
    km_model = KMeans(n_clusters=clusters, n_init=100, verbose=0, tol=1e-10)
    #print 'inertia: ', km_model.inertia_
    # Missing keys default to an empty list.
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
    return clustering
github chenyangh / DialogueGenerationWithEmotion / View on Github external
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    """Lower-case ``text``, optionally drop stopwords, and expand contractions.

    The text is lower-cased and split on whitespace; when
    ``remove_stopwords`` is true, English stopwords are filtered out. The
    tokens are re-joined with single spaces and an ordered series of regex
    substitutions is applied.

    NOTE(review): in the visible code the cleaned text is never returned and
    ``stem_words`` is never read; the snippet looks truncated -- confirm
    against the upstream file.
    """
    tokens = text.lower().split()

    if remove_stopwords:
        stopword_set = set(stopwords.words("english"))
        tokens = [tok for tok in tokens if tok not in stopword_set]

    cleaned = " ".join(tokens)

    # Order matters: e.g. "what's" must be expanded before the generic "'s"
    # rule, and "can't" before the generic "n't" rule.
    substitutions = (
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
github pratik008 / HealthCare_Twitter_Analysis / Archive 1 / consolidated_scripts / cleanup_scripts / View on Github external
def isStopword(text):
    """Report whether ``text`` contains at least one English stopword.

    ``text`` is split on single spaces and each piece is compared, by exact
    string match, against ``stopwords.words('english')``.

    The original snippet had an unreachable ``return 'N'`` placed directly
    after ``return 'Y'``, so text without stopwords fell off the end of the
    function and returned ``None``. The negative answer is now returned
    explicitly once every word has been checked.

    :param text: the string to inspect.
    :return: ``'Y'`` if any space-separated piece is a stopword, else ``'N'``.
    """
    # Read the stopword corpus once instead of once per word.
    english_stopwords = set(stopwords.words('english'))
    if any(w in english_stopwords for w in text.split(' ')):
        return 'Y'
    return 'N'
github eea / eea.corpus / src / eea.corpus / eea / corpus / processing / View on Github external
def process(content, env, **settings):
    """Yield each document with English stopwords removed from its text.

    For every ``doc`` in ``content`` the value of ``doc['text']`` is
    tokenised with ``word_tokenize``, tokens found in the English stopword
    list are dropped, and the survivors are joined with single spaces. The
    result of ``set_text(doc, text)`` is yielded for each document.

    :param content: iterable of documents supporting ``doc['text']``.
    :param env: not read by this function.
    :param settings: not read by this function.
    """
    # A set gives O(1) membership tests; the stopword list is read only once
    # per generator run.
    stops = frozenset(stopwords.words('english'))

    for doc in content:
        kept = [w for w in word_tokenize(doc['text']) if w not in stops]
        yield set_text(doc, " ".join(kept))
github alvations / stasis / MegaEXPERT-2016 / View on Github external
import graphlab as gl
from graphlab.toolkits.distances import cosine
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

    import cPickle as pickle
    import pickle

# Punctuation characters taken from the ``string`` module.
# NOTE(review): no ``import string`` is visible in this snippet -- confirm it
# exists in the full file.
punct = string.punctuation
# NOTE(review): the SFrame is built from an empty string; presumably a data
# path was removed from the published snippet -- confirm.
sts = gl.SFrame('')
# English stopwords held in a set for fast membership tests.
stoplist = set(stopwords.words('english'))

def get_embeddings(embedding_gzip, size):
    """Load embeddings from a space-delimited file into a plain dict.

    The file is read without a header row, with the first column typed as
    ``str`` and the next ``size`` columns typed as ``float``. Columns ``X2``
    through ``X<size>`` are packed into one column, the frame is converted to
    a pandas DataFrame indexed by ``X1``, and the second remaining column is
    returned as a dict keyed by the index values.

    NOTE(review): the packed range stops at ``X<size>`` although ``size + 1``
    columns are type-hinted, so the last column appears to stay unpacked, and
    the returned column is selected by position -- confirm both are intended.

    :param embedding_gzip: path passed to ``gl.SFrame.read_csv``.
    :param size: number of float columns that follow the word column.
    :return: dict mapping each ``X1`` value to its entry in the selected column.
    """
    # The unused ``headers`` list from the original was removed.
    coltypes = [str] + [float] * size
    sf = gl.SFrame.read_csv(embedding_gzip, delimiter=' ', column_type_hints=coltypes,
                            header=False, quote_char='\0')
    sf = sf.pack_columns(['X' + str(i) for i in range(2, size + 1)])
    df = sf.to_dataframe().set_index('X1')
    column_names = list(df)
    return df.to_dict(orient='dict')[column_names[1]]

#content_vocab = set(i.strip() for i in open('sts_vocab.txt', 'r').readlines())

def get_vector(word, embeddings):
    """Look up ``word`` in ``embeddings`` and return its entry as a NumPy array.

    The lookup is a plain ``embeddings[word]``, so a missing word raises
    whatever the mapping raises (``KeyError`` for a dict).
    """
    stored = embeddings[word]
    return np.array(stored)
github pdpipe / pdpipe / pdpipe / View on Github external
def __stopwords_by_language(language):
            # Returns ``stopwords.words(language)`` from NLTK. On LookupError
            # the fallback creates ~/nltk_data/corpora/stopwords (if absent)
            # and retries the same lookup.
            # NOTE(review): this snippet is truncated and does not parse as
            # shown: the ``try:`` that pairs with the ``except`` below is
            # missing, and nothing visible populates the directory before the
            # retry (presumably a download step was dropped) -- confirm
            # against the upstream file.
            from nltk.corpus import stopwords
            return stopwords.words(language)
        except LookupError:  # pragma: no cover
            # try:
            # except LookupError:  # pragma: no cover
            dpath = os.path.expanduser('~/nltk_data/corpora/stopwords')
            os.makedirs(dpath, exist_ok=True)
            from nltk.corpus import stopwords
            return stopwords.words(language)
github NLPatVCU / medinify / medinify / sentiment / View on Github external
def preprocess(self, reviews_filename):
        Transforms reviews (comments and ratings) into numerical representations (vectors)
        Comments are vectorized into bag-of-words representation
        Ratings are transformed into 0's (negative) and 1's (positive)
        Neutral reviews are discarded
        :param reviews_filename: CSV file with comments and ratings
        data: list of sparse matrices
            vectorized comments
        target: list of integers
            vectorized ratings

        stop_words = set(stopwords.words('english'))
        sp = spacy.load('en_core_web_sm')

        df = pd.read_csv(reviews_filename)
        raw_data, raw_target = [], []

        for review in df.itertuples():

            if type(review.comment) == float:
            comment = {token.text: True for token in sp.tokenizer(review.comment.lower()) if token.text
                       not in stop_words}

            if self.numclasses == 2:
                rating = 'pos'
                if review.rating == 3: