How to use nltk - 10 common examples

To help you get started, we’ve selected a few nltk examples based on popular ways the library is used in public projects.
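Most of the snippets below assume that the relevant NLTK data packages (the stopword list, WordNet, and the Punkt sentence tokenizer) have already been downloaded. A minimal setup sketch, using the standard NLTK resource identifiers:

import nltk

# One-time downloads for the corpora and models used in the examples below.
for resource in ("stopwords", "wordnet", "punkt"):
    nltk.download(resource)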


github uhh-lt / path2vec / wsd / graph_wsd_test_v2.py View on Github
def convert_to_wordnet_pos(senseval_pos):
    if senseval_pos == 'VERB':
        return wn.VERB
    elif senseval_pos == 'NOUN':
        return wn.NOUN
    elif senseval_pos == 'ADV':
        return wn.ADV
    elif senseval_pos == 'ADJ':
        return wn.ADJ
    else:
        return None
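The helper above returns one of WordNet's POS constants, so it can be used to restrict a synset lookup. A short usage sketch, assuming wn comes from `from nltk.corpus import wordnet as wn` and using 'bank' as an arbitrary example word:

from nltk.corpus import wordnet as wn

# Constrain the WordNet lookup to the mapped part of speech.
pos = convert_to_wordnet_pos('NOUN')
print(wn.synsets('bank', pos=pos)[:3])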
github relwell / corenlp-xml-lib / test / test_document.py View on Github
def test_subtrees_for_phrase(self):
        t = self._sentence.subtrees_for_phrase("NP")[0]
        self.assertIsInstance(t, Tree)
        self.assertEqual("property", t[-1].leaves()[0])
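The assertion operates on nltk.tree.Tree objects produced by corenlp-xml-lib. A standalone sketch of the same leaf access, using only NLTK and an illustrative bracketed parse:

from nltk.tree import Tree

# Build a small NP subtree and read its leaves, mirroring what the
# test inspects on subtrees_for_phrase("NP").
t = Tree.fromstring("(NP (DT the) (NN property))")
print(t.leaves())         # ['the', 'property']
print(t[-1].leaves()[0])  # 'property'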
github hoaxanalyzer / hoax-search-vote / extractor / preprocessor.py View on Github
def preprocess(text):
    text = (text.encode('utf-8').decode("ascii", "replace")
                .replace(u"\ufffd", "_").replace("___", "'")
                .replace("'s", " ").replace("``", " ").replace("''", " ")
                .replace("_", " ").replace("'", " ").replace("`", " "))
    text = re.sub("[^0-9a-zA-Z !\"/:;<=>?.,!@#$%^&-_|()']+", " ", text)
    tokens = text.split(" ")
    result = ""
    for token in tokens:
        word = token.split(" ")[0]
        if word not in stopwords.words('english') and token not in punctuations and token not in hoax_stopwords:
            if len(word) > 0:
                if word.isupper() and dictionary.check(word.lower()):
                    new_token = lemmatizer.lemmatize(token.lower())
                    if new_token == token.lower():
                        new_token = lemmatizer.lemmatize(token.lower(), pos='v')
                    result += new_token + " "
                elif word.isupper():
                    result += token.title() + " "
                elif dictionary.check(word.lower()):
                    new_token = lemmatizer.lemmatize(token.lower())
                    if new_token == token.lower():
                        new_token = lemmatizer.lemmatize(token.lower(), pos='v')
                    result += new_token + " "
                else:
                    result += token + " "
            else:
                pass  # empty token: nothing to add
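One detail worth isolating from this example is its lemmatisation fallback: try the default (noun) lemma first, then retry as a verb if nothing changed. A minimal sketch using only NLTK, where lemma_with_verb_fallback is an illustrative helper name:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemma_with_verb_fallback(token):
    # Default POS is noun; fall back to verb if the noun lemma is unchanged.
    lemma = lemmatizer.lemmatize(token)
    if lemma == token:
        lemma = lemmatizer.lemmatize(token, pos='v')
    return lemma

print(lemma_with_verb_fallback('running'))  # 'run'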
github hoaxanalyzer / hoax-search-vote / backup / newtfidfold.py View on Github
def create_dic(self, documents):
    texts = [[word for word in document.lower().split() if word not in stopwords.words('english')]
             for document in documents]

    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return [dictionary, corpus]
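Note that stopwords.words('english') is re-evaluated for every word inside the first comprehension; since it returns a plain list, converting it to a set up front makes the membership tests far cheaper. A small sketch of the same filtering step, with illustrative documents:

from nltk.corpus import stopwords

stop_set = set(stopwords.words('english'))
docs = ["The cat sat on the mat", "The dog sat on the log"]
texts = [[w for w in d.lower().split() if w not in stop_set] for d in docs]
print(texts)  # [['cat', 'sat', 'mat'], ['dog', 'sat', 'log']]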
github ParasAvkirkar / -Competitive-Coding-Problem-Classifier-and-Recommender / Data Extraction / codechef / problems.py View on Github
def create_word_features(self, words):
    # print words
    w = []
    for line in words:
        for wrd in line.split():
            w.append(wrd)
    useful_words = [word for word in w if word not in stopwords.words('english')]
    my_dict = ' '.join(useful_words)
    # print my_dict
    return my_dict
github HsuWanTing / unified-summarization / old_code / sentence-selector-pg / cluster.py View on Github
def cluster_texts(texts, clusters=3):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords.words('english'),
                                 max_df=1.0,
                                 min_df=1,
                                 lowercase=True)

    tfidf_model = vectorizer.fit_transform(texts)
    km_model = KMeans(n_clusters=clusters, n_init=100, verbose=0, tol=1e-10)
    km_model.fit(tfidf_model)
    #print 'inertia: ', km_model.inertia_
    #pdb.set_trace()
 
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering
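Calling a function like this requires the tokenizer and the scikit-learn imports that the excerpt omits. A standalone sketch of the same TF-IDF + K-Means pipeline, with nltk.word_tokenize standing in for the repo's process_text (an assumption) and illustrative documents:

import collections
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

docs = ["the cat sat on the mat",
        "dogs chase cats and mice",
        "stock markets fell sharply",
        "investors sold their shares"]
vectorizer = TfidfVectorizer(tokenizer=word_tokenize,
                             stop_words=stopwords.words('english'),
                             lowercase=True)
tfidf = vectorizer.fit_transform(docs)
km = KMeans(n_clusters=2, n_init=10).fit(tfidf)

# Group document indices by their assigned cluster label.
clustering = collections.defaultdict(list)
for idx, label in enumerate(km.labels_):
    clustering[label].append(idx)
print(dict(clustering))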
github chenyangh / DialogueGenerationWithEmotion / emotion_tagger_keras.py View on Github
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]

    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
github pratik008 / HealthCare_Twitter_Analysis / Archive 1 / consolidated_scripts / cleanup_scripts / textProc.py View on Github
def isStopword(text):
    # Return 'Y' if any word in the text is an English stopword, else 'N'.
    splitsent = text.split(' ')
    for w in splitsent:
        if w in stopwords.words('english'):
            return 'Y'
    return 'N'
github pshah123 / neural-rewriter / rewriter / skipthoughts.py View on Github
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    console.log("Loaded NLTK data")
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
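nltk.sent_tokenize wraps the same pretrained Punkt model that is loaded explicitly above, so the sentence-then-word tokenization step can also be sketched more compactly (the sample text is illustrative):

from nltk.tokenize import sent_tokenize, word_tokenize

text = "NLTK ships a pretrained Punkt model. It splits sentences first."
for sentence in sent_tokenize(text):
    print(word_tokenize(sentence))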
github WolfNiu / AdversarialDialogue / src / basic / adversary.py View on Github
def _build_para_dict(self):
        path = "data/ppdb-2.0-s-all"
        lines = read_lines(path)
        relations = [line.split(" ||| ")[-1] for line in lines]
        equivalent_pairs = []
        print("Preprocessing raw data...")
        for line in tqdm(lines):
            split = line.split(" ||| ")    
            if split[-1] == "Equivalence":
                equivalent_pairs.append(tuple(split[1:3]))        

        paraphrase_pairs = [line.split(" ||| ")[1:3] for line in lines]
        equivalent_pairs_ubuntu = []
        print("Extracting paraphrase pairs...")
        for pair in tqdm(equivalent_pairs):
            tokens_0 = word_tokenize(pair[0]) 
            tokens_1 = word_tokenize(pair[1])
            if not (self._contains_unknown(tokens_0) or self._contains_unknown(tokens_1)):
                equivalent_pairs_ubuntu.append(
                    (tokens_0, tokens_1))
        
        # Insert paraphrases in both directions
        print("Building dictionary...")
        self.paraphrase_dict = {}
        for (p0, p1) in tqdm(equivalent_pairs_ubuntu):
            p0 = tuple(p0)
            p1 = tuple(p1)
            try:
                self.paraphrase_dict[p0] = self.paraphrase_dict[p0] + [p1]
            except KeyError:
                self.paraphrase_dict[p0] = [p1]
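The try/except at the end is the usual grow-a-multimap pattern; collections.defaultdict expresses the same insertion without any exception handling. A small sketch with an illustrative paraphrase pair:

from collections import defaultdict
from nltk.tokenize import word_tokenize

# Same bookkeeping as above, but the empty list is created on first access.
paraphrase_dict = defaultdict(list)
p0 = tuple(word_tokenize("thank you very much"))
p1 = tuple(word_tokenize("thanks a lot"))
paraphrase_dict[p0].append(p1)
print(paraphrase_dict[p0])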