How to use the nltk.tokenize module in nltk

To help you get started, we’ve selected a few nltk.tokenize examples, based on popular ways the module is used in public projects.

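As a quick orientation before the project excerpts below, here is a minimal sketch of the two most common entry points, nltk.tokenize.sent_tokenize and nltk.tokenize.word_tokenize (it assumes the pre-trained 'punkt' sentence model has already been downloaded):

import nltk
# nltk.download('punkt')  # one-time download of the pre-trained Punkt sentence model

text = "NLTK ships several tokenizers. Sentence splitting and word splitting are the usual first steps."
sentences = nltk.tokenize.sent_tokenize(text)       # list of sentence strings
words = nltk.tokenize.word_tokenize(sentences[0])   # ['NLTK', 'ships', 'several', 'tokenizers', '.']
print(sentences)
print(words)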

github kachok / hitman / esl / src / controls.py (View on Github)
def get_sentences(page_title):
	all_sents = []
	txt = wikipydia.query_text_rendered(page_title)
	parse = BeautifulSoup(txt['html'])
	justtext = parse.get_text()
	#justtext = justtext.encode('utf-8')
	# Sentence-split the page text with a Punkt tokenizer built with default (untrained) parameters.
	tok = nltk.tokenize.PunktSentenceTokenizer()
	sents0 = tok.tokenize(justtext)
	chunker = TagChunker(treebank_chunker())
	i = 0
	for s0 in sents0:
		i += 1
		sents = s0.split('\n')
		for s in sents:
			verbfound = False
			nounfound = False
			ss = s.split()
			if(len(ss) > 0):
				# POS-tag and chunk the words, then scan the leaf tags for a verb (any tag starting with 'V').
				tree = chunker.parse(nltk.pos_tag(ss))
				for tag in [p[1] for p in tree.leaves()]:
					if(tag[0] == 'V'):
						verbfound = True
						break
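
The excerpt above instantiates nltk.tokenize.PunktSentenceTokenizer with default parameters. As a minimal sketch, separate from the project code, this is what that tokenizer does on its own; the pre-trained English model behind sent_tokenize usually handles abbreviations better:

import nltk

tok = nltk.tokenize.PunktSentenceTokenizer()   # default parameters, no learned abbreviation list
text = "Dr. Smith arrived late. The talk had already started."
print(tok.tokenize(text))
# Without training, "Dr." may be treated as a sentence boundary; the pre-trained model
# loaded by nltk.tokenize.sent_tokenize generally avoids that.
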
github codelucas / newspaper / newspaper / text.py (View on Github)
def candidate_words(self, stripped_input):
        import nltk
        s = nltk.stem.isri.ISRIStemmer()  # ISRI stemmer, designed for Arabic text
        words = []
        for word in nltk.tokenize.wordpunct_tokenize(stripped_input):
            words.append(s.stem(word))
        return words
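
wordpunct_tokenize, used above before stemming, splits text on whitespace and punctuation with a fixed regular expression and needs no downloaded models. A minimal sketch:

from nltk.tokenize import wordpunct_tokenize

print(wordpunct_tokenize("Can't stop, won't stop."))
# ['Can', "'", 't', 'stop', ',', 'won', "'", 't', 'stop', '.']
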
github nltk / nltk / nltk / corpora / genesis.py (View on Github)
def raw(files = 'english-kjv'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tree}
    """       

    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "genesis", file+".txt")
        f = open_corpus(path)
        for t in tokenize.whitespace(f.read()):
            yield t
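
This snippet targets a very old NLTK release in which tokenize.whitespace was a generator over whitespace-separated tokens. A rough modern equivalent, offered as a sketch rather than a drop-in replacement, is nltk.tokenize.WhitespaceTokenizer:

from nltk.tokenize import WhitespaceTokenizer

tok = WhitespaceTokenizer()
print(tok.tokenize("In the beginning God created the heaven and the earth."))
# ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven', 'and', 'the', 'earth.']
# Note that punctuation stays attached to words; only whitespace is split on.
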
github satyarth / supa-bot-fire / supabotfire.py (View on Github)
def supa_bot_fire(text, screen_name): # I parse that
    message = ""
    if any(banned_string in text for banned_string in banned_strings):
        return ""
    else:
        try:
            # Keep only the first sentence of the tweet, then POS-tag its word tokens.
            text = sentence_detector.tokenize(text.strip())[0]
            tag_list = nltk.pos_tag(nltk.tokenize.word_tokenize(text))
            if tag_list[0][1] == 'PRP' \
            and tag_list[1][1] in verb_forms \
            and not tag_list[0][0].lower() in banned_pronouns \
            and not tag_list[1][0].lower() in banned_verbs \
            and not tag_list[2][0] in ["n\'t", "ta"] \
            and not any("CC" == tag[1] for tag in tag_list):
                for tag in tag_list[2:-1]:
                    if any(string == tag[0] for string in no_pre_space):
                        message = message.strip() + tag[0] + " "
                    elif any(string == tag[0] for string in no_post_space):
                        message += tag[0]
                    else:
                        message += tag[0] + " "
                if tag_list[-1][0] not in [',', '.', '!', '?']:
                    message += tag_list[-1][0]
                message = message.strip()
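
The sentence_detector object used above is defined elsewhere in the project; it is presumably a pre-trained Punkt sentence tokenizer (an assumption, not a detail confirmed by this excerpt). A minimal sketch of two ways to get one:

import nltk

# Load the pre-trained English Punkt model directly...
sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')
first = sentence_detector.tokenize("I am on fire. Really.".strip())[0]

# ...or lean on sent_tokenize, which wraps the same model.
first = nltk.tokenize.sent_tokenize("I am on fire. Really.")[0]
print(first)  # 'I am on fire.'
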
github dwzhou / SentimentAnalysis / src / AnewSentimentAnalysis.py (View on Github)
"""
    # os.path.splitext drops the extension cleanly; rstrip('.txt') would strip trailing '.', 't', 'x' characters, not the suffix.
    output_file = os.path.join(output_dir, "Output Anew Sentiment " + os.path.splitext(os.path.basename(input_file))[0] + ".csv")

    # read file into string
    with open(input_file, 'r') as myfile:
        fulltext = myfile.read()
    # end method if file is empty
    if len(fulltext) < 1:
        print('Empty file.')
        return

    from nltk.stem.wordnet import WordNetLemmatizer
    lmtzr = WordNetLemmatizer()

    # otherwise, split into sentences
    sentences = tokenize.sent_tokenize(fulltext)
    i = 1 # to store sentence index
    # check each word in sentence for sentiment and write to output_file
    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = ['Sentence ID', 'Sentence', 'Sentiment', 'Sentiment Label', 'Arousal', 'Dominance',
                      '# Words Found', 'Found Words', 'All Words']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # analyze each sentence for sentiment
        for s in sentences:
            # print("S" + str(i) +": " + s)
            all_words = []
            found_words = []
            total_words = 0
            v_list = []  # holds valence scores
            a_list = []  # holds arousal scores
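
A typical way to continue from this point is to word-tokenize and lemmatize each sentence before looking words up in a sentiment lexicon. A minimal sketch of that sentence-then-word pattern (the lexicon lookup itself is omitted and the sample text is illustrative only):

from nltk import tokenize
from nltk.stem.wordnet import WordNetLemmatizer

lmtzr = WordNetLemmatizer()  # requires the 'wordnet' corpus to be downloaded
fulltext = "The movies were wonderful. The endings felt rushed."
for i, s in enumerate(tokenize.sent_tokenize(fulltext), start=1):
    words = tokenize.word_tokenize(s.lower())
    lemmas = [lmtzr.lemmatize(w) for w in words]   # default POS is noun, e.g. 'movies' -> 'movie'
    print(i, lemmas)
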
github gabrielspmoreira / chameleon_recsys / acr_module / acr / preprocessing / w2v_tfidf_adressa.py (View on Github)
def tokenize_norwegian_article(text, first_sentences=12, max_words_length=1000):
    #Removing pipes for correct sentence tokenization
    text = text.replace('|', '.')
    words_tokenized = []
    sent_count = 0
    for sentence in nltk.tokenize.sent_tokenize(text, language='norwegian'):        
        sent_tokenized = nltk.tokenize.word_tokenize(sentence, language='norwegian')
        if len(sent_tokenized) >= 3 and sent_tokenized[-1] in ['.', '!', '?', ';'] and \
           sent_tokenized != ['Saken', 'oppdateres', '.']:                
            sent_count += 1
            words_tokenized.extend(sent_tokenized)        
            if sent_count == first_sentences:
                break
    return words_tokenized[:max_words_length]
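
Both sent_tokenize and word_tokenize accept a language argument, as the Adressa preprocessing above shows; the punkt data package ships models for several languages, including Norwegian. A minimal sketch:

import nltk

text = "Dette er en test. Saken oppdateres."
for sentence in nltk.tokenize.sent_tokenize(text, language='norwegian'):
    print(nltk.tokenize.word_tokenize(sentence, language='norwegian'))
# ['Dette', 'er', 'en', 'test', '.']
# ['Saken', 'oppdateres', '.']
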
github gooofy / zamia-ai / speech_sentences.py (View on Github)
    punkt_trainer = nltk.tokenize.punkt.PunktTrainer()

    punkt_count = 0

    parole_crawl (parole, train_punkt)

    print
    print "Finalizing training..."
    punkt_trainer.finalize_training(verbose=True)
    print "Training done. %d text segments." % punkt_count
    print

    params = punkt_trainer.get_params()
    # print "Params: %s" % repr(params)

    tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(params)
    with open(PUNKT_PICKLEFN, mode='wb') as f:
            pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

    print '%s written.' % PUNKT_PICKLEFN

else:

    print "Loading %s ..." % PUNKT_PICKLEFN

    with open(PUNKT_PICKLEFN, mode='rb') as f:
        tokenizer = pickle.load(f)

    print "Loading %s ... done." % PUNKT_PICKLEFN

with codecs.open(SENTENCEFN, 'w', 'utf8') as outf:
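
The zamia-ai script above trains its own Punkt model incrementally and pickles the resulting tokenizer (the excerpt is Python 2, hence the bare print statements). A condensed sketch of the same train-then-pickle flow, assuming corpus.txt is a plain-text training corpus you supply:

import pickle
import nltk

raw_training_text = open('corpus.txt').read()   # hypothetical training corpus

trainer = nltk.tokenize.punkt.PunktTrainer()
trainer.train(raw_training_text, finalize=False, verbose=False)
trainer.finalize_training(verbose=True)

tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(trainer.get_params())
with open('punkt_custom.pickle', 'wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)
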
github diging / tethne / tethne / readers / zotero.py (View on Github)
    Returns
    -------
    :class:`.StructuredFeature`
        A :class:`.StructuredFeature` that contains sentence context.
    """
    with codecs.open(fpath, 'r') as f:  # Determine the encoding of the file.
        document = f.read()
    encoding = chardet.detect(document)['encoding']
    document = document.decode(encoding)

    tokens = []
    sentences = []

    i = 0
    for sentence in nltk.tokenize.sent_tokenize(document):
        sentences.append(i)

        for word in nltk.tokenize.word_tokenize(sentence):
            tokens.append(word)
            i += 1

    contexts = [('sentence', sentences)]
    return StructuredFeature(tokens, contexts)
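
When building structured features like the one above, it can also help that NLTK tokenizers expose span_tokenize, which yields character offsets instead of strings, so sentences and tokens can be mapped back into the original document. A small sketch, separate from the tethne code:

import nltk

document = "First sentence here. Second one follows."
spans = list(nltk.tokenize.PunktSentenceTokenizer().span_tokenize(document))
for start, end in spans:
    print((start, end), document[start:end])   # character offsets plus the sentence text
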
github nltk / nltk / nltk / corpora / inaugural.py (View on Github)
def raw(files = items):
    if type(files) is str: files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "inaugural", file + ".txt")
        f = open_corpus(path)
        text = f.read()
        for t in tokenize.wordpunct(text):
            yield t
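
As with the genesis example earlier, tokenize.wordpunct here is an API from a very old NLTK release. In current NLTK the same behaviour is available as wordpunct_tokenize, which is simply a RegexpTokenizer with the pattern shown below, so custom variants are easy to define:

from nltk.tokenize import RegexpTokenizer

wordpunct = RegexpTokenizer(r'\w+|[^\w\s]+')   # same pattern wordpunct_tokenize uses
print(wordpunct.tokenize("Fellow-citizens, I greet you."))
# ['Fellow', '-', 'citizens', ',', 'I', 'greet', 'you', '.']
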
github fastnlp / fastNLP / reproduction / HAN-document_classification / preprocess.py (View on Github)
    if (i+1) % 5000 == 0:
        print(i)
        pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb'))
        j += 1
        samples = []
pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb'))
# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb'))
# print(samples[0])


for fn in os.listdir(in_dirname):
    print(fn)
    precessed = []
    for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')):
        tokens = []
        sents = nltk.tokenize.sent_tokenize(text)
        for s in sents:
            tokens.append(tokenizer.tokenize(s))
        precessed.append((stars, tokens))
        # print(tokens)
        if len(precessed) % 100 == 0:
            print(len(precessed))
    pickle.dump(precessed, open(os.path.join(out_dirname, fn), 'wb'))
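
The tokenizer object used above is created earlier in the fastNLP script, outside this excerpt. The general pattern it follows, instantiating a word tokenizer once and reusing it inside a sent_tokenize loop, looks roughly like this; WordPunctTokenizer is only an assumed stand-in for whatever the project actually uses:

import nltk

tokenizer = nltk.tokenize.WordPunctTokenizer()   # assumed stand-in, reused across sentences
text = "Great food. Terrible service, though!"

tokens = []
for s in nltk.tokenize.sent_tokenize(text):
    tokens.append(tokenizer.tokenize(s))
print(tokens)
# [['Great', 'food', '.'], ['Terrible', 'service', ',', 'though', '!']]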