How to use the yake.datarepresentation.composed_word function in yake

To help you get started, we’ve selected a few yake examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github LIAAD / yake / yake / datarepresentation.py View on Github external
#Create co-occurrence matrix
                    if tag not in self.tagsToDiscard:
                        word_windows = list(range( max(0, len(block_of_word_obj)-windowsSize), len(block_of_word_obj) ))
                        for w in word_windows:
                            if block_of_word_obj[w][0] not in self.tagsToDiscard: 
                                self.addCooccur(block_of_word_obj[w][2], term_obj)
                    #Generate candidate keyphrase list
                    candidate = [ (tag, word, term_obj) ]
                    cand = composed_word(candidate)
                    self.addOrUpdateComposedWord(cand)
                    word_windows = list(range( max(0, len(block_of_word_obj)-(n-1)), len(block_of_word_obj) ))[::-1]
                    for w in word_windows:
                        candidate.append(block_of_word_obj[w])
                        self.freq_ns[len(candidate)] += 1.
                        cand = composed_word(candidate[::-1])
                        self.addOrUpdateComposedWord(cand)

                    # Add term to the block of words' buffer
                    block_of_word_obj.append( (tag, word, term_obj) )

            if len(block_of_word_obj) > 0:
                sentence_obj_aux.append( block_of_word_obj )

            if len(sentence_obj_aux) > 0:
                self.sentences_obj.append(sentence_obj_aux)

        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append( block_of_word_obj )

        if len(sentence_obj_aux) > 0:
            self.sentences_obj.append(sentence_obj_aux)
github LIAAD / yake / yake / datarepresentation.py View on Github external
def build_candidate(self, candidate_string):
        """Build a virtual candidate keyphrase from a raw string.

        The string is lower-cased, tokenized with ``web_tokenizer`` and
        ``split_contractions``, and every surviving token is paired with its
        tag (``self.getTag``) and its term object (``self.getTerm``).

        Args:
            candidate_string: Text of the candidate keyphrase.

        Returns:
            ``composed_word(None)`` when no token maps to a term with a
            non-zero ``tf`` (this includes the case of no tokens at all);
            otherwise a ``composed_word`` built from the list of
            ``(tag, word, term_obj)`` tuples, where ``term_obj`` is ``None``
            for tokens whose term has ``tf == 0``.
        """
        tokens = [
            w
            for w in split_contractions(web_tokenizer(candidate_string.lower()))
            # Drop empty tokens and multi-character tokens that begin with an
            # apostrophe (e.g. "'s"); a lone "'" is kept.
            if w and not (w.startswith("'") and len(w) > 1)
        ]

        candidate_terms = []
        for i, word in enumerate(tokens):
            tag = self.getTag(word, i)
            # NOTE(review): save_non_seen=False presumably keeps unseen words
            # from being registered -- confirm against getTerm.
            term_obj = self.getTerm(word, save_non_seen=False)
            if term_obj.tf == 0:
                # A term with zero frequency is treated as absent.
                term_obj = None
            candidate_terms.append((tag, word, term_obj))

        # Identity check against the None singleton: ``!= None`` would dispatch
        # to any custom comparison defined on the term object.
        if all(term is None for _, _, term in candidate_terms):
            return composed_word(None)
        return composed_word(candidate_terms)
github LIAAD / yake / yake / datarepresentation.py View on Github external
block_of_word_obj = []
                else:
                    tag = self.getTag(word, pos_sent)
                    term_obj = self.getTerm(word)
                    term_obj.addOccur(tag, sentence_id, pos_sent, pos_text)
                    pos_text += 1

                    #Create co-occurrence matrix
                    if tag not in self.tagsToDiscard:
                        word_windows = list(range( max(0, len(block_of_word_obj)-windowsSize), len(block_of_word_obj) ))
                        for w in word_windows:
                            if block_of_word_obj[w][0] not in self.tagsToDiscard: 
                                self.addCooccur(block_of_word_obj[w][2], term_obj)
                    #Generate candidate keyphrase list
                    candidate = [ (tag, word, term_obj) ]
                    cand = composed_word(candidate)
                    self.addOrUpdateComposedWord(cand)
                    word_windows = list(range( max(0, len(block_of_word_obj)-(n-1)), len(block_of_word_obj) ))[::-1]
                    for w in word_windows:
                        candidate.append(block_of_word_obj[w])
                        self.freq_ns[len(candidate)] += 1.
                        cand = composed_word(candidate[::-1])
                        self.addOrUpdateComposedWord(cand)

                    # Add term to the block of words' buffer
                    block_of_word_obj.append( (tag, word, term_obj) )

            if len(block_of_word_obj) > 0:
                sentence_obj_aux.append( block_of_word_obj )

            if len(sentence_obj_aux) > 0:
                self.sentences_obj.append(sentence_obj_aux)
github LIAAD / yake / yake / datarepresentation.py View on Github external
def build_candidate(self, candidate_string):
        """Build a virtual candidate keyphrase from a raw string.

        The string is lower-cased, tokenized with ``web_tokenizer`` and
        ``split_contractions``, and every surviving token is paired with its
        tag (``self.getTag``) and its term object (``self.getTerm``).

        Args:
            candidate_string: Text of the candidate keyphrase.

        Returns:
            ``composed_word(None)`` when no token maps to a term with a
            non-zero ``tf`` (this includes the case of no tokens at all);
            otherwise a ``composed_word`` built from the list of
            ``(tag, word, term_obj)`` tuples, where ``term_obj`` is ``None``
            for tokens whose term has ``tf == 0``.
        """
        tokens = [
            w
            for w in split_contractions(web_tokenizer(candidate_string.lower()))
            # Drop empty tokens and multi-character tokens that begin with an
            # apostrophe (e.g. "'s"); a lone "'" is kept.
            if w and not (w.startswith("'") and len(w) > 1)
        ]

        candidate_terms = []
        for i, word in enumerate(tokens):
            tag = self.getTag(word, i)
            # NOTE(review): save_non_seen=False presumably keeps unseen words
            # from being registered -- confirm against getTerm.
            term_obj = self.getTerm(word, save_non_seen=False)
            if term_obj.tf == 0:
                # A term with zero frequency is treated as absent.
                term_obj = None
            candidate_terms.append((tag, word, term_obj))

        # Identity check against the None singleton: ``!= None`` would dispatch
        # to any custom comparison defined on the term object.
        if all(term is None for _, _, term in candidate_terms):
            return composed_word(None)
        return composed_word(candidate_terms)

yake

Keyword extraction Python package

LGPL-3.0
Latest version published 4 years ago

Package Health Score

61 / 100
Full package analysis