How to use the nltk.word_tokenize function in nltk

To help you get started, we’ve selected a few nltk.word_tokenize examples, based on popular ways it is used in public projects.

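Before diving into the project snippets below, here is a minimal, self-contained sketch of basic usage. The sample sentence is ours; word_tokenize relies on the Punkt tokenizer models, which are downloaded once with nltk.download.

import nltk

# word_tokenize relies on the Punkt tokenizer models.
# Newer NLTK releases may ask for 'punkt_tab' instead of 'punkt'.
nltk.download('punkt')

sentence = "NLTK makes tokenization easy, doesn't it?"
tokens = nltk.word_tokenize(sentence)
print(tokens)
# ['NLTK', 'makes', 'tokenization', 'easy', ',', 'does', "n't", 'it', '?']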

github chen0040 / keras-question-and-answering-web-api / keras_question_and_answering_system / library / seq2seq_v2_glove.py View on Github external
def reply(self, paragraph, question):
        input_paragraph_seq = []
        input_question_seq = []
        input_paragraph_emb = []
        input_question_emb = []
        input_paragraph_text = paragraph.lower()
        input_question_text = question.lower()
        for word in nltk.word_tokenize(input_paragraph_text):
            if not in_white_list(word):
                continue
            emb = self.glove_model.encode_word(word)
            input_paragraph_emb.append(emb)
        for word in nltk.word_tokenize(input_question_text):
            if not in_white_list(word):
                continue
            emb = self.glove_model.encode_word(word)
            input_question_emb.append(emb)
        input_paragraph_seq.append(input_paragraph_emb)
        input_question_seq.append(input_question_emb)
        input_paragraph_seq = pad_sequences(input_paragraph_seq, self.max_encoder_paragraph_seq_length)
        input_question_seq = pad_sequences(input_question_seq, self.max_encoder_question_seq_length)
        states_value = self.encoder_model.predict([input_paragraph_seq, input_question_seq])
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, self.target_word2idx['START']] = 1
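
The snippet is cut off before decoding, but the tokenization pattern it relies on — lowercase, tokenize, skip words that fail a whitelist check, embed the rest — can be isolated as a short sketch. is_in_whitelist and embed below are hypothetical stand-ins for the project's in_white_list and glove_model.encode_word:

import nltk

def tokens_to_embeddings(text, embed, is_in_whitelist):
    # Lowercase and tokenize, drop non-whitelisted tokens, embed the survivors.
    embeddings = []
    for word in nltk.word_tokenize(text.lower()):
        if not is_in_whitelist(word):
            continue
        embeddings.append(embed(word))
    return embeddings
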
github alpoktem / movie2parallelDB / src / subsegment_movie.py View on Github external
def get_speaker_info_from_transcript(proscript, scriptfile):
	script_data_list, script_speaker_data, script_data = read_movie_transcript(scriptfile)

	index_segment = 0
	index_script = 0
	last_matched_script_index = 0
	while index_segment < proscript.get_no_of_segments() and index_script < len(script_data_list):
		curr_seg = proscript.segment_list[index_segment]
		entry_segment_list = nltk.word_tokenize(curr_seg.transcript.translate(PUNCTUATION_TRANS).lower())
		entry_script_list = script_data_list[index_script]

		#print("seg:%s"%entry_segment_list)
		#print("scr:%s"%entry_script_list)
		intersecting = get_list_intersection(entry_segment_list, entry_script_list)
		no_of_intersecting = len(intersecting)
		#print("%i/%i intersects"%(no_of_intersecting, len(entry_segment_list)))  
		meh = False
		if no_of_intersecting >= len(entry_segment_list) * SCRIPT_MATCH_THRESHOLD:
			#print("match")
			#print("seg(%i):%s\nscr(%i):%s"%(index_segment, curr_seg.transcript, index_script, script_data[index_script]))
			curr_seg.speaker_id = script_speaker_data[index_script]
			remove_list_from_list(entry_script_list, intersecting)
			script_data_list[index_script] = entry_script_list

			index_segment += 1
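
PUNCTUATION_TRANS is defined elsewhere in the project; a typical way to build such a translation table and tokenize a cleaned, lowercased transcript looks like the sketch below (our construction, not the project's exact definition):

import string
import nltk

# Map every ASCII punctuation character to None, i.e. delete it.
PUNCTUATION_TRANS = str.maketrans('', '', string.punctuation)

def clean_tokens(transcript):
    return nltk.word_tokenize(transcript.translate(PUNCTUATION_TRANS).lower())

print(clean_tokens("Well, I don't know!"))  # ['well', 'i', 'dont', 'know']
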
github stephenhky / PyShortTextCategorization / shorttext / classifiers / embed / autoencode / AutoencoderEmbedVecClassification.py View on Github external
def shorttext_to_embedvec(self, shorttext):
        """ Convert the short text into an averaged embedded vector representation.

        Given a short sentence, it converts all the tokens into embedded vectors according to
        the given word-embedding model, sums them up, and normalizes the resulting vector.
        It returns the vector that represents this short sentence.

        :param shorttext: a short sentence
        :return: an embedded vector that represents the short sentence
        :type shorttext: str
        :rtype: numpy.ndarray
        """
        vec = np.zeros(self.vecsize)
        tokens = word_tokenize(shorttext)
        for token in tokens:
            if token in self.wvmodel:
                vec += self.wvmodel[token]
        norm = np.linalg.norm(vec)
        if norm > 0:  # guard against a zero vector when no token is in the vocabulary
            vec /= norm
        return vec
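
Stripped of the class, the same averaged-embedding idea can be written as a standalone helper. This is a sketch; wvmodel is assumed to be a gensim KeyedVectors-style object that supports `token in wvmodel` and `wvmodel[token]`:

import numpy as np
import nltk

def sentence_vector(shorttext, wvmodel, vecsize):
    # Sum the embeddings of all in-vocabulary tokens, then L2-normalize.
    vec = np.zeros(vecsize)
    for token in nltk.word_tokenize(shorttext):
        if token in wvmodel:
            vec += wvmodel[token]
    norm = np.linalg.norm(vec)
    return vec / norm if norm > 0 else vec
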
github uclnlp / fever / doc_ir.py View on Github external
def doc_ir(data=list(),edocs=edict(),best=5,model=None):
    """
    Returns a dictionary of n best document titles for each claim.
    """
    rdocs=dict()
    for example in tqdm(data):
        claim=example["claim"]
        titles=find_titles_in_claim(claim,edocs)
        ctoks=word_tokenize(claim.lower())
        rdocs[example["id"]]=(titles,ctoks)
    t2tf=titles_to_tf()
    doctf=load_doc_tf(rdocs,t2tf)
    docs=dict()
    for example in tqdm(data):
        titles,ctoks=rdocs[example["id"]]
        tscores=best_titles(example["claim"],ctoks,titles,doctf,best,model)
        docs[example["id"]]=tscores
    return docs
github openlegaldata / oldp / oldp / apps / nlp / base.py View on Github external
def process(self, text: str) -> DocBase:
        tokens = nltk.word_tokenize(text)
        return ArrayDoc(text, tokens)
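
ArrayDoc and DocBase are the project's own wrapper types; a minimal stand-in that captures the idea of pairing the raw text with its token list (hypothetical, for illustration only):

import nltk

class ArrayDoc:
    """Holds the raw text together with its token list."""
    def __init__(self, text, tokens):
        self.text = text
        self.tokens = tokens

def process(text):
    return ArrayDoc(text, nltk.word_tokenize(text))
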
github vprusso / youtube_tutorials / natural_language_processing / nlp_2.py View on Github external
# Note that the gutenberg fileids only have a small subset of text compared
# to the large amount of content found on Project Gutenberg.  

# If you wish to process a text from Project Gutenberg accessed via the web,
# you may use the urllib module to retrieve it over the internet.
import nltk
from urllib.request import urlopen

# This URL corresponds to "The Picture of Dorian Gray" by Oscar Wilde.
url = "https://www.gutenberg.org/cache/epub/174/pg174.txt" 
raw = urlopen(url).read().decode('utf-8')

# Once the raw content has been extracted, we convert this content to something 
# that NLTK can understand and process. This should look somewhat familiar if 
# you have consulted Part 1 of this tutorial. 
dorian_grey = nltk.Text(nltk.word_tokenize(raw))

# Once the text has been converted to an NLTK Text object, we can process it 
# just like we have been doing previously. For example, here we convert the 
# text object to a frequency distribution and calculate the hapaxes. 
fdist_dorian = nltk.FreqDist(dorian_grey)
print(fdist_dorian.hapaxes())

# The above approach is not limited to text from Project Gutenberg, but is 
# broadly applicable to any text that can be obtained from a direct URL.

# Let us consider another text resource that NLTK allows us to process. One of
# them is a collection of web and chat data. The first corpus we shall focus on
# is the web text collection.

# We can print out the file ids of the webtext collection to see what is provided:
for file_id in nltk.corpus.webtext.fileids():
    print(file_id)
github RTXteam / RTX / code / reasoningtool / QuestionAnswering / QuestionTranslator.py View on Github external
		# Try to pattern match to one of the known queries
		(corpus_index, similarity) = wd.find_corpus(question, Q_corpora)

		if similarity < .25:
			# Unable to match to one of the templates
			results_dict["corpus_index"] = None
			results_dict["terms"] = None
			results_dict["error_code"] = "not_understood"
			results_dict["error_message"] = (
				"Sorry, I was unable to interpret your question. The nearest similar question I can answer is:\n %s"
				% Q_corpora[corpus_index][wd.max_in_corpus(question, Q_corpora[corpus_index])[0]]
			)
			return results_dict

		# get every contiguous sub-block in the query
		blocks = []
		question_tokenized = nltk.word_tokenize(question, "english")
		for block_size in range(1, len(question_tokenized)):
			for i in range(len(question_tokenized) - block_size + 1):
				block = " ".join(question_tokenized[i:(i + block_size)])
				blocks.append(block)
		blocks = list(reversed(blocks))  # go bigger to smaller since "is_assoc_with" \subst "gene_assoc_with" after stopword deletion

		# for each block, look for the associated terms in a greedy fashion
		#######################################################################
		# Q3: What are the protein targets of naproxen?
		#######################################################################
		if corpus_index == 3:  # Q3
			# Greedy look for drug name TODO: in the future, may need to disambiguate terms like I did for other Q's
			# with candidate_node_names
			source_name = None
			target_label = None
			relationship_type = None
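
The sub-block generation above is essentially an n-gram sweep over the token list. Isolated into a standalone helper (same logic, our function name), it looks like this:

import nltk

def contiguous_blocks(question):
    tokens = nltk.word_tokenize(question)
    blocks = []
    for block_size in range(1, len(tokens)):
        for i in range(len(tokens) - block_size + 1):
            blocks.append(" ".join(tokens[i:i + block_size]))
    return list(reversed(blocks))  # largest blocks first, as in the snippet

print(contiguous_blocks("protein targets of naproxen"))
# ['targets of naproxen', 'protein targets of', 'of naproxen', ...]
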
github chen0040 / keras-question-and-answering-web-api / keras_question_and_answering_system / library / seq2seq_v2.py View on Github external
def reply(self, paragraph, question):
        input_paragraph_seq = []
        input_question_seq = []
        input_paragraph_wid_list = []
        input_question_wid_list = []
        input_paragraph_text = paragraph.lower()
        input_question_text = question.lower()
        for word in nltk.word_tokenize(input_paragraph_text):
            if not text_utils.in_white_list(word):
                continue
            idx = 1  # default [UNK]
            if word in self.input_paragraph_word2idx:
                idx = self.input_paragraph_word2idx[word]
            input_paragraph_wid_list.append(idx)
        for word in nltk.word_tokenize(input_question_text):
            if not text_utils.in_white_list(word):
                continue
            idx = 1  # default [UNK]
            if word in self.input_question_word2idx:
                idx = self.input_question_word2idx[word]
            input_question_wid_list.append(idx)
        input_paragraph_seq.append(input_paragraph_wid_list)
        input_question_seq.append(input_question_wid_list)
        input_paragraph_seq = pad_sequences(input_paragraph_seq, self.max_encoder_paragraph_seq_length)
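
The word-to-index lookup with an UNK fallback is a common preprocessing step; below is a standalone sketch that mirrors the snippet's convention of reserving index 1 for unknown words (function and parameter names are ours):

import nltk

UNK_IDX = 1  # index reserved for out-of-vocabulary words

def text_to_word_ids(text, word2idx, is_in_whitelist=lambda w: True):
    wid_list = []
    for word in nltk.word_tokenize(text.lower()):
        if not is_in_whitelist(word):
            continue
        wid_list.append(word2idx.get(word, UNK_IDX))
    return wid_list
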
github deepmipt / DeepPavlov / deeppavlov / models / kbqa / entity_linking.py View on Github external
def candidate_entities_inverted_index(self, entity: str) -> List[Tuple[str]]:
        word_tokens = nltk.word_tokenize(entity)
        candidate_entities = []

        for tok in word_tokens:
            if len(tok) > 1:
                found = False
                if tok in self.inverted_index:
                    candidate_entities += self.inverted_index[tok]
                    found = True
                morph_parse_tok = self.morph.parse(tok)[0]
                lemmatized_tok = morph_parse_tok.normal_form
                if lemmatized_tok != tok and lemmatized_tok in self.inverted_index:
                    candidate_entities += self.inverted_index[lemmatized_tok]
                    found = True
                if not found:
                    words_with_levens_1 = self.searcher.search(tok, d=1)
                    for word in words_with_levens_1:
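
The per-token lookups above presume an inverted index that was built with the same tokenizer. A hypothetical sketch of constructing one from a list of entity titles (not DeepPavlov's actual loader):

from collections import defaultdict
import nltk

def build_inverted_index(entity_titles):
    # Map every token to the titles it occurs in.
    inverted_index = defaultdict(list)
    for title in entity_titles:
        for tok in nltk.word_tokenize(title.lower()):
            inverted_index[tok].append(title)
    return inverted_index
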
github JacobPlaster / ann-writer / Modules / NaturalLanguage.py View on Github external
def getTokenisedScentence(self, inSentence):
        return nltk.pos_tag(nltk.word_tokenize(inSentence))
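
nltk.pos_tag operates on the token list produced by word_tokenize, so the one-liner above chains the two. A quick usage example (the tagger model must be downloaded in addition to 'punkt'; newer NLTK releases may name it 'averaged_perceptron_tagger_eng'):

import nltk

nltk.download('averaged_perceptron_tagger')
tagged = nltk.pos_tag(nltk.word_tokenize("The quick brown fox jumps over the lazy dog."))
print(tagged)  # each token paired with a Penn Treebank tag, e.g. ('The', 'DT')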