How to use the nltk.corpus.stopwords.words function in nltk

To help you get started, we've selected a few examples of nltk.corpus.stopwords.words, based on popular ways it is used in public projects.

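At its simplest, stopwords.words returns a plain Python list of stop words for the requested language; the corpus must be downloaded once before first use. A minimal sketch:

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')  # one-time fetch of the stopwords corpus

stops = stopwords.words('english')
print(stops[:5])   # ['i', 'me', 'my', 'myself', 'we']
print(len(stops))  # 179 in recent nltk releases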

github hoaxanalyzer / hoax-search-vote / extractor / preprocessor.py
def preprocess(text):
    # re, stopwords, lemmatizer, dictionary, punctuations and hoax_stopwords
    # are imported/defined at module level elsewhere in the project
    text = (text.encode('utf-8').decode("ascii", "replace")
            .replace(u"\ufffd", "_").replace("___", "'")
            .replace("'s", " ").replace("``", " ").replace("''", " ")
            .replace("_", " ").replace("'", " ").replace("`", " "))
    text = re.sub("[^0-9a-zA-Z !\"/:;<=>?.,!@#$%^&-_|()']+", " ", text)
    tokens = text.split(" ")
    result = ""
    for token in tokens:
        word = token.split(" ")[0]
        if (word not in stopwords.words('english')
                and token not in punctuations
                and token not in hoax_stopwords):
            if len(word) > 0:
                if word.isupper() and dictionary.check(word.lower()):
                    new_token = lemmatizer.lemmatize(token.lower())
                    if new_token == token.lower():
                        new_token = lemmatizer.lemmatize(token.lower(), pos='v')
                    result += new_token + " "
                elif word.isupper():
                    result += token.title() + " "
                elif dictionary.check(word.lower()):
                    new_token = lemmatizer.lemmatize(token.lower())
                    if new_token == token.lower():
                        new_token = lemmatizer.lemmatize(token.lower(), pos='v')
                    result += new_token + " "
                else:
                    result += token + " "
    return result
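One caveat in the snippet above: stopwords.words('english') is called once per token, each call re-reads the word list, and membership testing against a list is linear. The usual fix, which several of the examples further down use, is to load the list once and keep it as a set:

from nltk.corpus import stopwords

# load the list once, outside any loop; set membership tests are O(1)
english_stops = set(stopwords.words('english'))

tokens = "this is a small example sentence".split()
content_words = [t for t in tokens if t not in english_stops]
print(content_words)  # ['small', 'example', 'sentence']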
github hoaxanalyzer / hoax-search-vote / backup / newtfidfold.py
def create_dic(self, documents):
    # keep only non-stopword tokens, lowercased
    texts = [[word for word in document.lower().split()
              if word not in stopwords.words('english')]
             for document in documents]

    # drop tokens that appear only once across the corpus
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return [dictionary, corpus]
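A hedged driver for the excerpt above: it expects stopwords and gensim's corpora at module scope, and since self is unused in the body, it can be called as a plain function:

from gensim import corpora
from nltk.corpus import stopwords

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
]
dictionary, corpus = create_dic(None, documents)  # self is unused
print(dictionary.token2id)  # token -> integer id
print(corpus)               # bag-of-words vectors, one per document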
github ParasAvkirkar / -Competitive-Coding-Problem-Classifier-and-Recommender / Data Extraction / codechef / problems.py
def create_word_features(self, words):
    # print words
    w = []
    for line in words:
        for wrd in line.split():
            w.append(wrd)
    useful_words = [word for word in w if word not in
                    stopwords.words('english')]
    my_dict = ' '.join(useful_words)
    # print my_dict
    return my_dict
github HsuWanTing / unified-summarization / old_code / sentence-selector-pg / cluster.py
def cluster_texts(texts, clusters=3):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=1.0,
                                 min_df=1,
                                 lowercase=True)

    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters, n_init=100, verbose=0, tol=1e-10)
    km_model.fit(tfidf_model)
    #print 'inertia: ', km_model.inertia_
    #pdb.set_trace()
 
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering
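A hedged driver for the excerpt above; process_text is a project-specific tokenizer not shown in the excerpt, so a plain str.split stands in for it here:

import collections
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

process_text = str.split  # stand-in for the project's tokenizer

texts = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "stock prices fell sharply on monday",
    "markets rallied after the announcement",
]
for label, doc_ids in cluster_texts(texts, clusters=2).items():
    print(label, doc_ids)  # cluster label -> indices of member texts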
github chenyangh / DialogueGenerationWithEmotion / emotion_tagger_keras.py
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]

    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
github pratik008 / HealthCare_Twitter_Analysis / Archive 1 / consolidated_scripts / cleanup_scripts / textProc.py
def isStopword(text):
    # return 'Y' if the text contains at least one English stopword
    splitsent = text.split(' ')
    for w in splitsent:
        if w in stopwords.words('english'):
            return 'Y'
    return 'N'
github eea / eea.corpus / src / eea.corpus / eea / corpus / processing / stopwords.py
def process(content, env, **settings):
    stops = stopwords.words('english')

    for doc in content:

        words = word_tokenize(doc['text'])
        text = [w for w in words if w not in stops]
        text = " ".join(text)

        yield set_text(doc, text)
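A hedged driver for the generator above: set_text and the document schema are eea.corpus internals, so a plain-dict stand-in is used here:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  # requires the punkt tokenizer data


def set_text(doc, text):
    doc = dict(doc)
    doc['text'] = text
    return doc


content = [{'text': 'this is a very simple example document'}]
for doc in process(content, env=None):
    print(doc['text'])  # -> "simple example document"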
github alvations / stasis / MegaEXPERT-2016 / sts_glove.py
import string
import graphlab as gl
from graphlab.toolkits.distances import cosine
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

try:
    import cPickle as pickle
except ImportError:
    import pickle


punct = string.punctuation
sts = gl.SFrame('sts_all.gl/')
stoplist = set(stopwords.words('english'))

def get_embeddings(embedding_gzip, size):
    headers = ['word'] + ['d'+str(i) for i in range(1, size+1)]
    coltypes = [str] + [float] * size
    sf = gl.SFrame.read_csv( embedding_gzip, delimiter=' ', column_type_hints=coltypes, header=False, quote_char='\0')
    sf = sf.pack_columns(['X'+str(i) for i in range(2, size+1)])
    df = sf.to_dataframe().set_index('X1')
    column_names = list(df)
    return df.to_dict(orient='dict')[column_names[1]]


#content_vocab = set(i.strip() for i in open('sts_vocab.txt', 'r').readlines())

def get_vector(word, embeddings):
    return np.array(embeddings[word])
github pdpipe / pdpipe / pdpipe / nltk_stages.py
def __stopwords_by_language(language):
    try:
        from nltk.corpus import stopwords
        return stopwords.words(language)
    except LookupError:  # pragma: no cover
        # try:
        #     nltk.data.find('corpora/stopwords')
        # except LookupError:  # pragma: no cover
        dpath = os.path.expanduser('~/nltk_data/corpora/stopwords')
        os.makedirs(dpath, exist_ok=True)
        nltk.download('stopwords')
        from nltk.corpus import stopwords
        return stopwords.words(language)
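A detail worth noting in this snippet: stopwords.words raises LookupError when the corpus has not been downloaded yet, so catching that exception and calling nltk.download('stopwords') lets a library fetch the data on demand the first time it is needed.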
github NLPatVCU / medinify / medinify / sentiment / review_classifier.py
def preprocess(self, reviews_filename):
        """
        Transforms reviews (comments and ratings) into numerical representations (vectors)
        Comments are vectorized into bag-of-words representation
        Ratings are transformed into 0's (negative) and 1's (positive)
        Neutral reviews are discarded
        :param reviews_filename: CSV file with comments and ratings
        :return:
        data: list of sparse matrices
            vectorized comments
        target: list of integers
            vectorized ratings
        """

        stop_words = set(stopwords.words('english'))
        sp = spacy.load('en_core_web_sm')

        df = pd.read_csv(reviews_filename)
        raw_data, raw_target = [], []

        for review in df.itertuples():

            # pandas parses empty comment cells as NaN (a float), so skip them
            if type(review.comment) == float:
                continue
            comment = {token.text: True for token in sp.tokenizer(review.comment.lower())
                       if token.text not in stop_words}

            if self.numclasses == 2:
                rating = 'pos'
                if review.rating == 3:
                    continue