How to use the textblob.packages.nltk module in textblob

To help you get started, we’ve selected a few TextBlob examples based on popular ways the vendored textblob.packages.nltk module is used in public projects. In the snippets below, nltk refers to the copy of NLTK bundled with TextBlob, typically imported as from textblob.packages import nltk.


github sloria / textblob-aptagger / textblob_aptagger / taggers.py (view on GitHub)
def tag(self, corpus, tokenize=True):
        '''Tags a string `corpus`.'''
        # Assume untokenized corpus has \n between sentences and ' ' between words
        s_split = nltk.sent_tokenize if tokenize else lambda t: t.split('\n')
        w_split = nltk.word_tokenize if tokenize else lambda s: s.split()
        def split_sents(corpus):
            for s in s_split(corpus):
                yield w_split(s)

        prev, prev2 = self.START
        tokens = []
        for words in split_sents(corpus):
            context = self.START + [self._normalize(w) for w in words] + self.END
            for i, word in enumerate(words):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev, prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens
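For reference, a minimal usage sketch (assuming textblob-aptagger is installed; PerceptronTagger loads its bundled pre-trained model by default):

from textblob_aptagger import PerceptronTagger

tagger = PerceptronTagger()  # loads the pickled model shipped with the package
print(tagger.tag('Simple is better than complex.'))
# -> a list of (word, tag) tuples, e.g. [('Simple', 'NN'), ('is', 'VBZ'), ...]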
github sloria / TextBlob / download_corpora_lite.py (view on GitHub)
def main():
    for each in REQUIRED_CORPORA:
        print('Downloading "{0}"'.format(each))
        nltk.download(each)
    print("Finished.")
github sloria / TextBlob / text / np_extractors.py (view on GitHub)
def extract(self, text):
        '''Return a list of noun phrases (strings) for body of text.'''
        sentences = nltk.tokenize.sent_tokenize(text)
        noun_phrases = []
        for sentence in sentences:
            parsed = self._parse_sentence(sentence)
            # Get the string representation of each subtree that is a
            # noun phrase tree
            phrases = [
                _normalize_tags(filter_insignificant(each, self.INSIGNIFICANT_SUFFIXES))
                for each in parsed
                if isinstance(each, nltk.tree.Tree)
                and each.label() == 'NP'
                and len(filter_insignificant(each)) >= 1
                and _is_match(each, cfg=self.CFG)
            ]
            nps = [tree2str(phrase) for phrase in phrases]
            noun_phrases.extend(nps)
        return noun_phrases
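This extract method belongs to TextBlob's ConllExtractor, which can be plugged into a TextBlob instance. A minimal usage sketch, assuming the modern textblob package layout:

from textblob import TextBlob
from textblob.np_extractors import ConllExtractor

extractor = ConllExtractor()  # trains/loads its CoNLL-2000 chunker on first use
blob = TextBlob('Python is a high-level programming language.',
                np_extractor=extractor)
print(blob.noun_phrases)  # e.g. ['python', 'high-level programming language']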
github sloria / TextBlob / download_corpora.py (view on GitHub)
def main():
    for each in REQUIRED_CORPORA:
        print('Downloading "{0}"'.format(each))
        nltk.download(each)
    print("Finished.")
github sloria / TextBlob / text / np_extractors.py (view on GitHub)
    @requires_nltk_corpus
    def train(self):
        '''Train the Chunker on the CoNLL-2000 corpus.'''
        train_data = [[(t, c) for _, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in
                      nltk.corpus.conll2000.chunked_sents('train.txt',
                                                    chunk_types=['NP'])]
        unigram_tagger = nltk.UnigramTagger(train_data)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
        self._trained = True
github sloria / TextBlob / text / np_extractors.py (view on GitHub)
def parse(self, sentence):
        '''Return the parse tree for the sentence.'''
        if not self._trained:
            self.train()
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in
                     zip(sentence, chunktags)]
        return nltk.chunk.util.conlltags2tree(conlltags)
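Together with the train method above, this parse method forms a small noun-phrase chunker (the ChunkParser class in the same module). A hypothetical call, with a hand-built POS-tagged sentence:

# sentence is a list of (word, pos) pairs, as produced by a POS tagger
chunker = ChunkParser()
tagged = [('The', 'DT'), ('quick', 'JJ'), ('fox', 'NN'), ('jumps', 'VBZ')]
print(chunker.parse(tagged))
# -> an nltk Tree with NP subtrees, e.g. (S (NP The/DT quick/JJ fox/NN) jumps/VBZ)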
github sloria / TextBlob / text / np_extractors.py (view on GitHub)
    @requires_nltk_corpus
    def train(self):
        train_data = nltk.corpus.brown.tagged_sents(categories='news')
        regexp_tagger = nltk.RegexpTagger([
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
            (r'(-|:|;)$', ':'),
            (r'\'*$', 'MD'),
            (r'(The|the|A|a|An|an)$', 'AT'),
            (r'.*able$', 'JJ'),
            (r'^[A-Z].*$', 'NNP'),
            (r'.*ness$', 'NN'),
            (r'.*ly$', 'RB'),
            (r'.*s$', 'NNS'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*', 'NN'),
            ])
        unigram_tagger = nltk.UnigramTagger(train_data, backoff=regexp_tagger)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
        self._trained = True
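This is the training step of TextBlob's FastNPExtractor: a bigram tagger trained on the Brown news category, backed off by a unigram tagger and finally a regular-expression tagger. A minimal usage sketch:

from textblob.np_extractors import FastNPExtractor

extractor = FastNPExtractor()  # train() runs lazily on the first extract() call
print(extractor.extract('Python is a high-level programming language.'))
# e.g. ['Python', 'programming language']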