How to use the spacy.tokens.Span class in spaCy

To help you get started, we've selected a few spaCy examples, based on popular ways Span is used in public projects.

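Before the project examples, here is a minimal sketch (not taken from any of the repositories below) of the two patterns most of them rely on: slicing a Doc into a Span, and registering a custom extension attribute on the Span class. The model name en_core_web_sm is an assumption; any installed spaCy pipeline works.

import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")  # assumed model; swap in any installed pipeline
doc = nlp("Snyk scans source code for security issues.")

# A Span is a slice of a Doc: Span(doc, start_token, end_token, label=...)
span = Span(doc, 0, 1, label="ORG")
print(span.text, span.label_)

# Custom attributes are registered once on the class and accessed via ._.
Span.set_extension("is_flagged", default=False, force=True)
span._.is_flagged = True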

github chartbeat-labs / textacy / tests / test_extract.py
    def test_default(self, spacy_doc):
        result = list(extract.noun_chunks(spacy_doc))
        assert all(isinstance(span, Span) for span in result)

github chartbeat-labs / textacy / tests / test_readme.py
    for bigram in bigrams:
        assert isinstance(bigram, Span)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc, 3, filter_stops=True, filter_punct=True, min_freq=2)
    )[:10]
    for trigram in trigrams:
        assert isinstance(trigram, Span)
        assert len(trigram) == 3

    nes = list(
        extract.entities(doc, drop_determiners=False, exclude_types="numeric")
    )[:10]
    for ne in nes:
        assert isinstance(ne, Span)
        assert ne.label_
        assert ne.label_ != "QUANTITY"

    pos_regex_matches = list(
        extract.pos_regex_matches(doc, constants.POS_REGEX_PATTERNS["en"]["NP"])
    )[:10]
    for match in pos_regex_matches:
        assert isinstance(match, Span)

    stmts = list(extract.semistructured_statements(doc, "I", cue="be"))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], compat.unicode_)
        assert len(stmt) == 3

    kts = textacy.ke.textrank(doc, topn=10)

github mpuig / spacy-lookup / spacy_lookup / __init__.py
"""Initialise the pipeline component.
        """
        self._has_entities, self._is_entity, self._entity_desc, self._entities, self.canonical = attrs

        # Set up the KeywordProcessor
        self.keyword_processor = KeywordProcessor(case_sensitive=case_sensitive)
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label

        # Register attribute on the Doc and Span
        Doc.set_extension(self._has_entities, getter=self.has_entities, force=True)
        Doc.set_extension(self._entities, getter=self.iter_entities, force=True)
        Span.set_extension(self._has_entities, getter=self.has_entities, force=True)
        Span.set_extension(self._entities, getter=self.iter_entities, force=True)

        # Register attribute on the Token.
        Token.set_extension(self._is_entity, default=False, force=True)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc, force=True)
        Token.set_extension(self.canonical, default=None, force=True)

github rominf / profanity-filter / profanity_filter / spacy_component.py
        def do() -> None:
            Token.set_extension('censored', default=None)
            Token.set_extension('is_profane', getter=SpacyProfanityFilterComponent.token_is_profane)
            Token.set_extension('original_profane_word', default=None)

            Span.set_extension('is_profane', getter=SpacyProfanityFilterComponent.tokens_are_profane)
            Doc.set_extension('is_profane', getter=SpacyProfanityFilterComponent.tokens_are_profane)

github CogStack / MedCAT / medcat / spacy_cat.py
                            cui,
                            self.cdb.cui2pretty_name.get(cui, ''),
                            self.cdb.cui2tui.get(cui, ''),
                            self.cdb.tui2name.get(self.cdb.cui2tui.get(cui, ''), ''),
                            float(acc))
                elif self.LBL_STYLE == 'ent':
                    lbl = "{} - {:.2}".format(self.cdb.tui2name.get(
                        self.cdb.cui2tui.get(cui, ''), ''),
                        float(acc))
                elif self.LBL_STYLE == 'none':
                    lbl = ""
                else:
                    lbl = cui

                lbl = doc.vocab.strings.add(lbl)
                ent = Span(doc, tkns[0].i, tkns[-1].i + 1, label=lbl)


                if self.ACC_ALWAYS:
                    acc = self._calc_acc(cui, doc, tkns, name)

                ent._.acc = acc
                ent._.cui = cui
                ent._.tui = self.cdb.cui2tui.get(cui, 'None')
                ent._.id = self.ent_id
                self.ent_id += 1
                doc._.ents.append(ent)

                # Increase counter for cui_count_ext if not already added
                if cui not in self._cuis:
                    if cui in self.cdb.cui_count_ext:
                        self.cdb.cui_count_ext[cui] += 1

github ICLRandD / Blackstone / blackstone / pipeline / abbreviations.py
    def find_matches_for(
        self, filtered: List[Tuple[Span, Span]], doc: Doc
    ) -> List[Tuple[Span, Set[Span]]]:
        rules = {}
        all_occurences: Dict[Span, Set[Span]] = defaultdict(set)
        already_seen_long: Set[str] = set()
        already_seen_short: Set[str] = set()
        for (long_candidate, short_candidate) in filtered:
            short, long = find_abbreviation(long_candidate, short_candidate)
            # We need the long and short form definitions to be unique, because we need
            # to store them so we can look them up later. This is a bit of a
            # pathological case also, as it would mean an abbreviation had been
            # defined twice in a document. There's not much we can do about this,
            # but at least the case which is discarded will be picked up below by
            # the global matcher. So it's likely that things will work out ok most of the time.
            new_long = long.string not in already_seen_long if long else False
            new_short = short.string not in already_seen_short
            if long is not None and new_long and new_short:
                already_seen_long.add(long.string)
                already_seen_short.add(short.string)
                all_occurences[long].add(short)

github opentargets / data_pipeline / modules / LiteratureNLP.py
        :return:
        '''
        allowed_pos = [NOUN, ADJ, PUNCT, PROPN]
        allowed_dep = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl", "dobj",  "attr", "oprd", "pobj", "conj",
                       "compound", "amod", "punct", "meta", "npadvmod", "nmod"]#, add "prep" to extend for "of and "in"
        extended_tokens = [i for i in tok.subtree if (i.dep_ in allowed_dep and i in tok.children) or (i == tok)]
        allowed_continous_tokens = []
        # break the extended token if something not allowed is between the selected tokens in the subtree
        curr_pos = extended_tokens[0].i - 1
        for ex_t in extended_tokens:
            if ex_t.i == curr_pos+1:
                curr_pos = ex_t.i
                allowed_continous_tokens.append(ex_t)
            else:
                break
        span = Span(self.doc, allowed_continous_tokens[0].i, allowed_continous_tokens[-1].i + 1)
        return span

github NLPatVCU / medaCy / medacy / pipeline_components / lexicon / lexicon_component.py
        in the doc that match the lexicon and overlays the appropriate label as 'feature_is_label_from_lexicon'
        over all tokens in the span.
        :param doc:
        :return:
        """
        logging.debug("Called Lexicon Component")

        matcher = PhraseMatcher(self.nlp.vocab, max_length=10)
        for label in self.lexicon:
            Token.set_extension('feature_is_' + label + '_from_lexicon', default=False, force=True)
            patterns = [self.nlp.make_doc(term) for term in self.lexicon[label]]
            logging.debug(patterns)
            matcher.add(label, None, *patterns)
        matches = matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end)
            logging.debug(span)
            if span is not None:
                logging.debug('Lexicon term matched: %s Label: %s' % (span.text, self.nlp.vocab.strings[match_id]))
                for token in span:
                    token._.set('feature_is_' + self.nlp.vocab.strings[match_id] + '_from_lexicon', True)

        return doc

github NLPatVCU / medaCy / medacy / pipeline_components / units / frequency_unit_component.py
    def __call__(self, doc):
        nlp = self.nlp
        with doc.retokenize() as retokenizer:
            # match frequency indicators
            matches = self.frequency_matcher(doc)
            for match_id, start, end in matches:
                span = Span(doc, start, end, label=nlp.vocab.strings['frequency_indicator'])
                for token in span:
                    token._.feature_is_frequency_indicator = True
                if len(span) > 1:
                    retokenizer.merge(span)
                doc.ents = list(doc.ents) + [span]
        return doc

github CogStack / MedCAT / medcat / utils / spacy_pipe.py
    def add_cat(self, spacy_cat):
        self.nlp.add_pipe(spacy_cat, name='cat', last=True)

        # Add custom fields needed for this use case
        Doc.set_extension('ents', default=None, force=True)
        Span.set_extension('acc', default=-1, force=True)
        Span.set_extension('cui', default=-1, force=True)
        Span.set_extension('tui', default=-1, force=True)
        Span.set_extension('id', default=0, force=True)
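
Once these extensions are registered, any Span produced by the pipeline carries them under the ._. namespace. A minimal usage sketch, assuming a doc has been processed by the MedCAT pipeline configured above (the attribute names come from the Span.set_extension calls in this snippet):

# assumes `doc` came out of the pipeline built by add_cat() above
for ent in doc._.ents:
    print(ent.text, ent._.cui, ent._.tui, ent._.acc, ent._.id)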