How to use the nltk.data.load function in nltk

To help you get started, we’ve selected a few nltk.data.load examples, based on popular ways it is used in public projects.

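Most of the examples below call nltk.data.load with a resource path such as 'tokenizers/punkt/english.pickle' to load NLTK's pre-trained Punkt sentence tokenizer. The following is a minimal sketch of that pattern (not taken from any of the projects below); it assumes the punkt resource is available, which nltk.download('punkt') takes care of.

import nltk

# Fetch the Punkt models if they are not already in the NLTK data directory.
nltk.download('punkt')

# nltk.data.load resolves the path against nltk.data.path and unpickles the
# resource; for this path it returns a PunktSentenceTokenizer.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print(sent_tokenizer.tokenize('NLTK ships a Punkt model. It splits text into sentences.'))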

github dheeraj7596 / SCDV / Reuters / Word2Vec.py View on Github external
import logging
from gensim.models import Word2Vec
from KaggleWord2VecUtility import KaggleWord2VecUtility
import time
import sys
import csv
import nltk
import pandas as pd

if __name__ == '__main__':

    start = time.time()
    # The csv file may contain very large fields, so raise the field size limit to the maximum.
    csv.field_size_limit(sys.maxsize)
    # Read train data.
    train_word_vector = pd.read_pickle('all.pkl')
    # Use the NLTK tokenizer to split the paragraph into sentences.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = []
    print("Parsing sentences from training set...")

    # Loop over each news article.
    for review in train_word_vector["text"]:
        try:
            # Split a review into parsed sentences.
            sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=True)
        except Exception:
            # Skip reviews that fail to parse.
            continue

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    num_features = int(sys.argv[1])  # Word vector dimensionality
    min_word_count = 20  # Minimum word count
github UKPLab / acl2017-interactive_summarizer / summarizer / algorithms / upper_bound_ilp.py View on Github external
#nltk.download('stopwords')

from nltk.data import load as LPickle

import sys, os.path as path
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
sys.path.append(path.dirname(path.dirname(path.dirname(path.abspath(__file__)))))

from summarizer.utils.data_helpers import extract_ngrams2, prune_ngrams, untokenize
from summarizer.algorithms.base import Sentence
from _summarizer import Summarizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

sent_detector = LPickle('tokenizers/punkt/english.pickle')

class ExtractiveUpperbound(Summarizer):
    def __init__(self, language):
        self.sentences = []
        self.docs = []
        self.models = []
        self.doc_sent_dict = {}
        self.ref_ngrams = []
        self.LANGUAGE = language
        self.stemmer = SnowballStemmer(self.LANGUAGE)
        self.stoplist = set(stopwords.words(self.LANGUAGE)) 

    def __call__(self, docs, models, length, ngram_type=2):
        self.sum_length = int(length)
        self.load_data(docs, models)
        self.get_ref_ngrams(ngram_type)
github after12am / summary / summary / topic / keygragh.py View on Github external
def divide_into_senteces(self, cache=True):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(self.text)
    # remove period from end of sentence
    return [re.sub(r'\.$', '', sentence) for sentence in sentences]
github gabrielspmoreira / chameleon_recsys / acr_module / acr / preprocessing / acr_preprocess_gcom.py View on Github external
    string = re_quotes_3.sub('"', string)
    string = re.sub('"', '', string)
    string = re_dots.sub('.', string)
    string = re_punctuation.sub(r'\1', string)
    string = re_hiphen.sub(' - ', string)
    string = re_punkts.sub(r'\1 \2 \3', string)
    string = re_punkts_b.sub(r'\1 \2 \3', string)
    string = re_punkts_c.sub(r'\1 \2', string)
    string = re_doublequotes_1.sub('\"', string)
    string = re_doublequotes_2.sub('\'', string)
    string = re_trim.sub(' ', string)
        
    return string.strip()


sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
def clean_and_filter_first_sentences(string, first_sentences=8):
    # Tokenize sentences and remove short and malformed sentences.
    sentences = []
    for sent in sent_tokenizer.tokenize(string):
        if sent.count(' ') >= 3 and sent[-1] in ['.', '!', '?', ';']:
            sentences.append(clean_str(sent))
            if len(sentences) == first_sentences:
                break
    return ' '.join(sentences)

#############################################################################################

def load_input_csv(path):
    news_df = pd.read_csv(path, encoding = 'utf-8')

    #Concatenating all available text
github Hurence / logisland / logisland-plugins / logisland-scripting-plugin / src / main / resources / nltk / sentiment / util.py View on Github external
def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp
    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text))
github facebookresearch / ParlAI / projects / wizard_of_wikipedia / knowledge_retriever / knowledge_retriever.py View on Github external
def _set_up_sent_tok(self):
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    # nltk-specific setup
    st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
    try:
        self.sent_tok = nltk.data.load(st_path)
    except LookupError:
        nltk.download('punkt')
        self.sent_tok = nltk.data.load(st_path)
github loyalzc / nlp_kesci / essay_scoring_ml.py View on Github external
def data_cleaning(data):
    print('---data_cleaning start...')
    # Tokenize each essay into words.
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    data["words"] = data["essay"].apply(tokenizer.tokenize)
    # Split each essay into sentences.
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    data['sents'] = data["essay"].apply(sent_detector.tokenize)
    # Count characters (total length of all words).
    data['character_count'] = data['words'].apply(lambda x: len(''.join(x)))
    # Part-of-speech tags for the words.
    data['tags'] = data['words'].apply(pos_tag)
    print('---data_cleaning end...')
    return data
github samirsen / image-generator / skipthoughts.py View on Github external
def preprocess(text):
	"""
	Preprocess text for encoder
	"""
	X = []
	sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
	for t in text:
		sents = sent_detector.tokenize(t)
		result = ''
		for s in sents:
			tokens = word_tokenize(s)
			result += ' ' + ' '.join(tokens)
		X.append(result)
	return X
github john-kurkowski / Kilgore-Trout / kilgoretrout / extract.py View on Github external
def tag_nes(cls, tokenized_sents):
    # Load a part-of-speech tagger via the "nltk:" URL scheme and tag every sentence.
    # (batch_tag/batch_parse are the NLTK 2.x names; NLTK 3 renamed them to
    # tag_sents/parse_sents.)
    tagger_url = 'nltk:taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = nltk.data.load(tagger_url)
    tagged = tagger.batch_tag(tokenized_sents)

    # Load the multiclass named-entity chunker and chunk the tagged sentences.
    ne_chunker_url = 'nltk:chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
    ne_chunker = nltk.data.load(ne_chunker_url)
    nes = ne_chunker.batch_parse(tagged)
    return nes
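
The snippet above passes an explicit 'nltk:' URL scheme to nltk.data.load. A bare resource path behaves the same way, since paths without a scheme are resolved against the directories in nltk.data.path; a small sketch of the equivalence, assuming the punkt resource is installed:

import nltk

# Both calls resolve the same resource; the "nltk:" prefix is optional.
t1 = nltk.data.load('tokenizers/punkt/english.pickle')
t2 = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
assert type(t1) is type(t2)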
github osma / annif / autoindex.py View on Github external
FINNISH = re.compile(r'\b(ja|joka|oli|kuin|jossa|jotka|jonka)\b')
SWEDISH = re.compile(r'\b(och|med|som|att|den|det|eller|av)\b')
ENGLISH = re.compile(r'\b(and|of|for|at|the)\b')

def is_in_language(targetlang, text):
    # Quick and dirty regex shortcuts for detecting the most common languages
    if FINNISH.search(text) is not None:
        return (targetlang == 'fi')
    if SWEDISH.search(text) is not None:
        return (targetlang == 'sv')
    if ENGLISH.search(text) is not None:
        return (targetlang == 'en')
    # assume it's the right language
    return True

sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 
def split_to_sentences(text, targetlang):
    sentences = []
    for sentence in sentence_tokenizer.tokenize(text):
        if not is_in_language(targetlang, sentence):
            continue
        sentences.append(sentence)
    return sentences

@functools.lru_cache(maxsize=100000)
def search(text, proj, cutoff_frequency):
    es = Elasticsearch()

    query = {
        'query': {
            'function_score': {
                'query': {