How to use the nltk.tokenize.word_tokenize function in nltk

To help you get started, we've selected a few examples of nltk.tokenize.word_tokenize, drawn from popular ways it is used in public projects.
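
Before the project snippets, here is a minimal, self-contained sketch of the basic pattern they all share (the sample sentence and variable names are ours, not taken from any of the projects below):

import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # word_tokenize relies on the Punkt models ('punkt_tab' on newer NLTK releases)

text = "NLTK makes tokenization easy, doesn't it?"
print(word_tokenize(text))
# ['NLTK', 'makes', 'tokenization', 'easy', ',', 'does', "n't", 'it', '?']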


github pshah123 / neural-rewriter / rewriter / skipthoughts.py
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    console.log("Loaded NLTK data")  # note: 'console' is not a Python builtin; presumably defined elsewhere in this project
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
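
The snippet above loads the Punkt English model by its pickle path; nltk.tokenize.sent_tokenize does the same job under the hood, so the pattern can be written more compactly. A simplified sketch of the same idea (not the project's code):

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')

def preprocess(texts):
    """Sentence-split each document, word-tokenize every sentence,
    and rejoin all tokens with single spaces."""
    cleaned = []
    for t in texts:
        tokens = [tok for sent in sent_tokenize(t) for tok in word_tokenize(sent)]
        cleaned.append(' '.join(tokens))
    return cleaned
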
github WolfNiu / AdversarialDialogue / src / basic / adversary.py
def _build_para_dict(self):
        path = "data/ppdb-2.0-s-all"
        lines = read_lines(path)
        relations = [line.split(" ||| ")[-1] for line in lines]
        equivalent_pairs = []
        print("Preprocessing raw data...")
        for line in tqdm(lines):
            split = line.split(" ||| ")    
            if split[-1] == "Equivalence":
                equivalent_pairs.append(tuple(split[1:3]))        

        paraphrase_pairs = [line.split(" ||| ")[1:3] for line in lines]
        equivalent_pairs_ubuntu = []
        print("Extracting paraphrase pairs...")
        for pair in tqdm(equivalent_pairs):
            tokens_0 = word_tokenize(pair[0]) 
            tokens_1 = word_tokenize(pair[1])
            if not (self._contains_unknown(tokens_0) or self._contains_unknown(tokens_1)):
                equivalent_pairs_ubuntu.append(
                    (tokens_0, tokens_1))
        
        # Insert paraphrases in both directions
        print("Building dictionary...")
        self.paraphrase_dict = {}
        for (p0, p1) in tqdm(equivalent_pairs_ubuntu):
            p0 = tuple(p0)
            p1 = tuple(p1)
            try:
                self.paraphrase_dict[p0] = self.paraphrase_dict[p0] + [p1]
            except KeyError:
                self.paraphrase_dict[p0] = [p1]
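
Because token lists are not hashable, the method above keys its dictionary on tuples of tokens and grows the value lists inside a try/except. A collections.defaultdict gives the same behaviour with less ceremony; a small sketch with made-up phrase pairs:

from collections import defaultdict
from nltk.tokenize import word_tokenize

pairs = [("be able to", "can"), ("a number of", "several")]  # hypothetical data

paraphrase_dict = defaultdict(list)
for left, right in pairs:
    # Tuples of tokens are hashable, so they can serve as dictionary keys.
    paraphrase_dict[tuple(word_tokenize(left))].append(word_tokenize(right))

print(paraphrase_dict[('be', 'able', 'to')])  # [['can']]
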
github singnet / nlp-services / named-entity-recognition / services / modules / classifiers_mod.py
def process_text(self, input_text):
        token_text = word_tokenize(str(input_text))
        return token_text
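
word_tokenize also takes a language argument (default 'english') that selects the Punkt model used for its internal sentence-splitting step, for example:

import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
print(word_tokenize("¿Dónde está la biblioteca?", language='spanish'))
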
github burglarhobbit / machine-reading-comprehension / machine-reading-comprehension / snet-tensorflow / preprocess.py
#for a in answer:
		#answer_start = int(a['answer_start'])
		
		# will have to update with some kind of span prediction using rouge-L
		answer_start = randint(0,int(len(passage_concat)*8/10))

		#add '.' here, just because NLTK is not good enough in some cases
		answer_words = word_tokenize(answer_1 + '.')
		if answer_words[-1] == '.':
			answer_words = answer_words[:-1]
		else:
			answer_words = word_tokenize(answer_1)

		#word level
		prev_context_words = word_tokenize( passage_concat[:answer_start] )
		left_context_words = word_tokenize( passage_concat[answer_start:] )
		pos_list = []
		for i in range(len(answer_words)):
			if i < len(left_context_words):
				pos_list.append(len(prev_context_words) + i)
		#assert(len(pos_list) > 0)
		if(len(pos_list) == 0):
			print(answer_words)
			print(answer)
			print(ab)
			print(question)
			assert(False)

		# sent level
		# [sent_idx, word_idx]
		for idx, sent in enumerate(passage_sent):
			if sublist_exists(answer_words, sent):
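
The excerpt above turns a character-level answer_start into word-level positions by tokenizing the text on each side of the offset. A condensed sketch of that idea (the function and variable names are ours):

from nltk.tokenize import word_tokenize

def answer_token_span(passage, answer, answer_start):
    """Return the word-level indices covered by an answer that begins
    at character offset answer_start within passage."""
    answer_tokens = word_tokenize(answer)
    # Tokens to the left of the answer tell us where it starts in word space.
    offset = len(word_tokenize(passage[:answer_start]))
    return list(range(offset, offset + len(answer_tokens)))
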
github gunthercox / chatterbot-weather / chatterbot_weather / weather_adapter.py
def get_latitude(self, user_input):
        """
        Returns the latitude extracted from the input.
        """
        from nltk import tokenize

        for token in tokenize.word_tokenize(user_input):
            if 'latitude=' in token:
                return re.sub('latitude=', '', token)

        return ''
github variational-attention / tf-var-attention / ved_detAttn / ved_detAttn.py
symbol.append('?')

        for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate(
                data_utils.get_batches(x_val, y_val, self.batch_size)):
            answer_logits = sess.run(self.inference_logits,
                                     feed_dict={self.input_data: input_batch,
                                                self.source_sentence_length: source_sent_lengths,
                                                self.keep_prob: 1.0,
                                                self.word_dropout_keep_prob: 1.0,
                                                self.z_temperature: self.z_temp})

            for k, pred in enumerate(answer_logits):
                hypotheses_val.append(
                    word_tokenize(
                        " ".join([self.decoder_idx_word[i] for i in pred if i not in [self.pad, -1, self.eos]])) + symbol)
                references_val.append([word_tokenize(true_val[batch_i * self.batch_size + k])])

        bleu_scores = eval_utils.calculate_bleu_scores(references_val, hypotheses_val)
        self.epoch_bleu_score_val['1'].append(bleu_scores[0])
        self.epoch_bleu_score_val['2'].append(bleu_scores[1])
        self.epoch_bleu_score_val['3'].append(bleu_scores[2])
        self.epoch_bleu_score_val['4'].append(bleu_scores[3])
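
The validation loop above word-tokenizes both hypotheses and references before scoring them. The same tokenized lists can be fed straight to NLTK's corpus_bleu, which expects exactly that shape:

from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import corpus_bleu

# Toy data standing in for model predictions and ground-truth sentences.
hypotheses = [word_tokenize("the quick brown fox jumps over the dog")]
references = [[word_tokenize("the quick brown fox jumps over the lazy dog")]]

print(corpus_bleu(references, hypotheses))  # corpus-level BLEU-4
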
github huseinzol05 / Python-DevOps / basic / 1.autopep8 / malaya / topic.py
def clearstring(string):
    string = unidecode(string)
    string = re.sub('[^A-Za-z ]+', '', string)
    string = word_tokenize(string)
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string).lower()
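    # Collapse any run of the same character to at most two occurrences
    # (e.g. 'sooooo' -> 'soo') before returning the cleaned string.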
    return ''.join(''.join(s)[:2] for _, s in itertools.groupby(string))
github hmason / ml_class / intro_web_data / classify.py
def get_features(self, document):
        document = re.sub('[%s]' % re.escape(string.punctuation), '', document) # removes punctuation
        document = document.lower() # make everything lowercase
        all_words = [w for w in word_tokenize(document) if len(w) > 3 and len(w) < 16]
        p = PorterStemmer()
        all_words = [p.stem(w) for w in all_words]
        all_words_freq = FreqDist(all_words)
        
        # print sorted(all_words_freq.items(), key=lambda(w,c):(-c, w))
        return all_words_freq
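
The same tokenize, stem, and count pipeline as a standalone sketch (the sample text is ours):

from nltk import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()
tokens = [stemmer.stem(w) for w in word_tokenize("Running runners run quickly") if w.isalpha()]
print(FreqDist(tokens).most_common(3))  # e.g. [('run', 2), ('runner', 1), ('quickli', 1)]
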
github burglarhobbit / machine-reading-comprehension / machine-reading-comprehension / snet-tensorflow / preprocess.py
answer_1 = answer[0].strip()
			if answer_1 == [] or answer_1 == '':
				answer_1 = answer[1].strip()
				print(True)
		else:
			answer_1 = answer[0].strip()
		passage_concat = ''
		#for pi, p in enumerate(article["paragraphs"]):
		for passage in json_line['passages']:
			passage_concat += passage['passage_text']
			
		#context = p["context"]
		#context = context.replace("''", '" ')
		#context = context.replace("``", '" ')

		passage = word_tokenize(passage_concat)
		passage_sent = sent_tokenize(passage_concat)
		passage_sent = [word_tokenize(sent) for sent in passage_sent]
		passages.append(passage) # word level paragraph
		passages_sent.append(passage_sent) # sentence_word level paragraph
		passages_original.append(passage_concat) # original paragraph
		passages_original_sent.append(passage_sent) # sentence_tokenized original paragraph
		for w in passage:
			word_counter[w] += 1

		#for qa in p["qas"]:
		question = word_tokenize(json_line["query"])
		answers = []
		answers_sent = []
		for w in question:
			word_counter[w] += 1
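
The word_counter above is filled one token at a time; a collections.Counter can also ingest a whole token list in one call:

from collections import Counter
from nltk.tokenize import word_tokenize

word_counter = Counter()
word_counter.update(word_tokenize("to be or not to be"))
print(word_counter.most_common(2))  # [('to', 2), ('be', 2)]
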
github duoergun0729 / nlp / code / yelp.py
def do_keras_lstm(text,stars):

    # Convert the text into bag-of-words sequences
    max_document_length=200

    # Remove common (stop) words
    text_cleaned=[]

    list_stopWords = list(set(stopwords.words('english')))
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
    d = enchant.Dict("en_US")

    for line in text:

        # Tokenize
        list_words = word_tokenize(line.lower())
        # Strip punctuation
        list_words = [word for word in list_words if word not in english_punctuations]
        # Drop uncommon English words (the wordnet variant is commented out; pyenchant is used instead)
        #list_words = [word for word in list_words if wordnet.synsets(word) ]
        list_words = [word for word in list_words if d.check(word)]
        # Filter out stopwords
        filtered_words = [w for w in list_words if not w in list_stopWords]
        text_cleaned.append( " ".join(filtered_words) )


    text=text_cleaned

    # Cap the number of tokens kept, i.e. the bag-of-words vocabulary size
    tokenizer = Tokenizer(num_words=max_features,lower=True)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
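
The cleaning loop above (lowercase, tokenize, drop punctuation, keep only dictionary words, remove stopwords) reduces to a short self-contained sketch if the pyenchant spell check is left out:

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean(line):
    tokens = word_tokenize(line.lower())
    tokens = [w for w in tokens if w not in string.punctuation]  # drop punctuation
    return " ".join(w for w in tokens if w not in stop_words)    # drop stopwords

print(clean("This review, honestly, was not that bad!"))  # -> 'review honestly bad'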