How to use nltk.corpus.stopwords in nltk

To help you get started, we’ve selected a few nltk examples based on popular ways nltk.corpus.stopwords is used in public projects.

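Before the project snippets below, here is a minimal, self-contained sketch of the pattern they all share: download the stop word corpus once, build a set of English stop words, and filter tokens against it. The sample sentence and variable names are illustrative and not taken from any of the projects.

import re
import nltk
from nltk.corpus import stopwords

# The stop word lists are NLTK corpus data, not bundled with the package itself;
# download them once per environment.
nltk.download('stopwords')

# Build the stop word set once and reuse it for every document.
stop_words = set(stopwords.words('english'))

text = "This is an example sentence showing how stop words are removed."
tokens = re.findall(r'[a-z]+', text.lower())
filtered = [w for w in tokens if w not in stop_words]
print(filtered)  # e.g. ['example', 'sentence', 'showing', 'stop', 'words', 'removed']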

github nfmcclure / tensorflow_cookbook / 07_Natural_Language_Processing / 06_Using_Word2Vec_Embeddings / 06_using_word2vec.py
import os
import numpy as np
import tensorflow as tf
import text_helpers
from nltk.corpus import stopwords
from tensorflow.python.framework import ops

ops.reset_default_graph()

os.chdir(os.path.dirname(os.path.realpath(__file__)))

# Start a graph session
sess = tf.Session()

# Declare model parameters
embedding_size = 200
vocabulary_size = 2000
batch_size = 100
max_words = 100

# Declare stop words
stops = stopwords.words('english')

# Load Data
print('Loading Data')
texts, target = text_helpers.load_movie_data()

# Normalize text
print('Normalizing Text Data')
texts = text_helpers.normalize_text(texts, stops)

# Texts must contain at least 3 words
target = [target[ix] for ix, x in enumerate(texts) if len(x.split()) > 2]
texts = [x for x in texts if len(x.split()) > 2]

# Split up data set into train/test
train_indices = np.random.choice(len(target), round(0.8*len(target)), replace=False)
test_indices = np.array(list(set(range(len(target))) - set(train_indices)))
github senya-ashukha / bigram-anchor-words / ad_hock / text_prep.py
import nltk, os
from collections import Counter
from nltk.corpus import stopwords

sw = set(stopwords.words('english'))

porter = nltk.PorterStemmer()

# Characters to keep: ASCII letters (both cases), digits, and whitespace
en = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digit, space = '0123456789', ' \t\n'
good_symbol = en + en.lower() + digit + space


def filter_word(word):
    return u''.join(filter(lambda char: char in good_symbol, list(word)))


def filter_bw(text):
    text = text.split()
    text = filter(lambda x: len(x) > 2, text)
    text = filter(lambda x: not x.isdigit(), text)
github yftah89 / Neural-SCL-Domain-Adaptation / AE-SCL-SR / w2v.py
from nltk.corpus import stopwords
import csv
import gensim
import logging
import Cython
import os
from gensim import models
from gensim.models import word2vec
import re
import xml.etree.ElementTree as ET
import pickle


punctuations = [")", "(", "''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", ".", "?", "!", ",", ":", "-", "--",
                "...", ";"]
stops = stopwords.words('english')
encoding_ = 'utf-8'
def getClear_full(sentence):
    r = re.findall(r'\b\w+\b', sentence.lower())
    r = " ".join(r)
    return r

def getClear(sentence,bigram):
    r = re.findall(r'\b\w+\b', sentence.lower())
    length=len(r)

    i=0
    while(i
github HouJP / kaggle-quora-question-pairs / houjp / bin / feature.py
import numpy as np
from os import listdir
from os.path import isfile, join
import re
import hashlib

from scipy.sparse import csr_matrix
from nltk.corpus import stopwords

# LogUtil is used below for logging; it is assumed to live in the same
# project-level utils module as DataUtil.
from utils import DataUtil, LogUtil


class Feature(object):
    '''
    Feature engineering utilities
    '''

    # Stop words
    stops = set(stopwords.words("english"))
    # IDF dictionary built from train.csv
    train_idf = {}

    def __init__(self):
        return

    @staticmethod
    def load_npz(ft_fp):
        loader = np.load('%s.npz' % ft_fp)
        features = csr_matrix((loader['data'],
                               loader['indices'],
                               loader['indptr']),
                              shape=loader['shape'])
        LogUtil.log("INFO", "load npz feature file done (%s)" % ft_fp)
        return features
github rupak-118 / Quora-Question-Pairs / MaLSTM_train.py
vocabulary = dict()
inverse_vocabulary = ['']  # '' acts as a placeholder for the zero vector embedding

qs = pd.DataFrame({'q1': q1, 'q2': q2})
questions_cols = ['q1', 'q2']

# Iterate through the text of both questions of each pair
from tqdm import tqdm
for index, row in tqdm(qs.iterrows()):
    
    for question in questions_cols:
        
        q2n = []  # q2n -> numerical vector representation of each question
        for word in row[question]:
            # Skip stop words that do not have a word2vec mapping
            if word in set(stopwords.words('english')) and word not in word2vec_model.vocab:
                continue

            if word not in vocabulary:
                vocabulary[word] = len(inverse_vocabulary)
                q2n.append(len(inverse_vocabulary))
                inverse_vocabulary.append(word)
            else:
                q2n.append(vocabulary[word])

        # Replace questions with equivalent numerical vector/ word-indices
        qs.set_value(index, question, q2n)
    

# Prepare embedding layer
embedding_dim = 300
embeddings = np.random.randn(len(vocabulary)+1, embedding_dim) # Embedding matrix
github GajjarMihir / Topic-Based-Question-Generation / preprocessing.py
# Preprocessing the data.

import json					# For the json data.
import gzip					# Using gzip because the file is in .gz format.
import os					# For listing all the files from the directory.

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

path = 'Add the path to the data here'		# The path containing the data.

files = os.listdir(path)	# stores all the files in the directory from the path into a list.

Questions = []				# For storing the questions.
Answers = []				# For storing the answers for a particular question.
All_Answers = []			# For storing the answers of all the questions.
max_question_word_length = 50	# Maximum question length, in words.

# Gives a generator object of the file in the dataset.
def parse(path):			
  g = gzip.open(path, 'r')
  for l in g:
    yield eval(l)
github UKPLab / lrec2018-live-blog-corpus / summarize / upper_bound.py
def __init__(self, language):
        self.sentences = []
        self.docs = []
        self.models = []
        self.doc_sent_dict = {}
        self.ref_ngrams = []
        self.LANGUAGE = language
        self.stemmer = SnowballStemmer(self.LANGUAGE)
        self.stoplist = set(stopwords.words(self.LANGUAGE))
github iamprem / summarizer / dataprep.py
def clean_vertex(line):
    """ Take review id and sentence tuple and clean it to (review_id, word list) """
    rev_id, sent = line[0], line[1]
    lmtz = WordNetLemmatizer()
    sw = stopwords.words('english')
    words = re.findall(r'[a-zA-Z]+', sent)
    words = [lmtz.lemmatize(w.lower()) for w in words if w.lower() not in sw]
    words = [w for w in words if len(w) > 3]
    return rev_id, words
github cvdigitalai / katecheo / kb-search / KBSearch.py
def clean_text_and_remove_stopwords(self, text):

        # Remove punctuation.
        text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

        # Remove unnecessary white space.
        text = re.sub(r'\s+', ' ', text).strip()

        # Convert every word to lowercase.
        text = text.lower()

        # Tokenize each word and remove common stop words in English.
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(text)
        filtered_tokens = [w for w in word_tokens if w not in stop_words]
        text = " ".join(filtered_tokens)

        return text
github Ashton-Sidhu / aethos / aethos / preprocessing / text.py
if lower:
        corpus = corpus.lower()

    for token in word_tokenize(corpus):

        if punctuation:
            if token in string.punctuation:
                continue

            token = token.translate(str.maketrans("", "", string.punctuation))

        if numbers:
            token = token.translate(str.maketrans("", "", "0123456789"))

        if stopwords:
            stop_words = nltk.corpus.stopwords.words("english")
            if token in stop_words:
                continue

        if stemmer:
            stem = SnowballStemmer("english")
            token = stem.stem(token)

        transformed_corpus += token + " "

    return transformed_corpus.strip()
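
One pattern worth noting across these examples: most of the snippets build the stop word collection once up front (feature.py and KBSearch.py, for instance, construct set(stopwords.words("english")) a single time and reuse it), while MaLSTM_train.py rebuilds set(stopwords.words('english')) for every word in its loop and text.py rebuilds the list for every token. If you adapt these snippets, hoisting the set out of the loop is an easy win. A small sketch of that refactor, using an illustrative remove_stopwords helper that does not come from any of the projects above:

from nltk.corpus import stopwords

# Build the set once at module level (or pass it in) rather than inside a loop.
STOP_WORDS = set(stopwords.words('english'))

def remove_stopwords(tokens):
    # Set membership tests are O(1), and the corpus wordlist is only read once.
    return [t for t in tokens if t.lower() not in STOP_WORDS]

print(remove_stopwords(['This', 'is', 'a', 'simple', 'example']))  # ['simple', 'example']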