How to use the spacy.tokens.Token class in spaCy

To help you get started, we've selected a few spaCy examples based on popular ways spacy.tokens.Token is used in public projects.

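Most of the examples below revolve around Token.set_extension, which registers a custom attribute on every token under the ._ underscore namespace. A minimal, self-contained sketch of the basic pattern (the attribute name is illustrative, not taken from any project below):

import spacy
from spacy.tokens import Token

# Register a custom attribute with a stored default value.
Token.set_extension('is_unit', default=False)

nlp = spacy.blank('en')
doc = nlp('Take 5 ml daily')
doc[2]._.is_unit = True  # default-based extensions can be overwritten per token

print([(t.text, t._.is_unit) for t in doc])
# [('Take', False), ('5', False), ('ml', True), ('daily', False)]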

github NLPatVCU / medaCy / medacy / pipeline_components / feature_overlayers / metamap / metamap_all_types_component.py View on GitHub
"""
        Runs a document to the metamap_annotator pipeline component. This overlays rich medical features by utilizing
        MetaMap output and aligning it with a passed spacy Doc object. By medaCy conventions, each overlayed feature
        is available as a token extension starting with 'feature_'. This component overlays 'feature_cui' and a
        separate boolean feature for each semantic type to detect available under 'feature_is_{type}". This component
        was originally designed to increase recall on Drug entities hence by default 'feature_is_orch' and
        'feature_is_phsu' where orch and phsu are semantic types corresponding to organic chemicals and pharmalogical
        substances respectively.
        :param doc: spaCy Doc object to run through pipeline
        :return: the same Doc object
        """
        logging.debug("Called MetaMapAllTypesOverlayer")

        # register all extensions
        if self.cuis:
            Token.set_extension('feature_cui', default="-1", force=True)  # cui feature

        if not hasattr(doc._, 'file_name'):
            metamap_json = self.metamap.map_text(str(doc))
        elif doc._.file_name is None or doc._.file_name == 'STRING_INPUT':
            metamap_json = self.metamap.map_text(str(doc))
        elif os.path.isfile(doc._.file_name):
            # Check if pre-metamapped file exists at expected location
            txt_file_path = doc._.file_name
            metamapped_path = _get_metamapped_path(txt_file_path)
            if not os.path.isfile(metamapped_path):
                warnings.warn(
                    f"No metamapped file was found for '{txt_file_path}'; attempting to run MetaMap over document (results in slower runtime); ensure MetaMap is running")
                metamap_json = self.metamap.map_text(str(doc))
            else:
                # This branch of the decision tree is reached if the file is already metamapped
                metamap_json = self.metamap.load(metamapped_path)
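The medaCy components here call Token.set_extension with force=True. By default, spaCy raises a ValueError when an extension name is registered twice; force=True overwrites the existing extension instead, which is convenient when a component may be constructed more than once in the same process. A minimal sketch of the difference (the attribute name is illustrative):

from spacy.tokens import Token

# First registration succeeds.
Token.set_extension('feature_cui', default='-1')

# Registering the same name again raises ValueError...
try:
    Token.set_extension('feature_cui', default='-1')
except ValueError:
    pass

# ...unless force=True is passed, which replaces the existing extension.
Token.set_extension('feature_cui', default='-1', force=True)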
github NLPatVCU / medaCy / medacy / pipeline_components / units / volume_unit_component.py View on GitHub
def __init__(self, spacy_pipeline):
        self.nlp = spacy_pipeline
        Token.set_extension('feature_is_volume_unit', default=False)
        self.nlp.entity.add_label('volume_unit')
        self.volume_matcher = Matcher(self.nlp.vocab)

        self.volume_matcher.add('UNIT_OF_VOLUME', None,
                                [{'LOWER': 'ml'}],
                                [{'ORTH': 'dL'}],
                                [{'LOWER': 'cc'}],
                                [{'ORTH': 'L'}])
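Note that this snippet uses the spaCy v2 Matcher API, where each pattern follows an on_match callback as a separate positional argument. In spaCy v3 the callback became a keyword argument and all patterns are passed together as one list, so an equivalent registration would look like this (a sketch of the v3 form, not code from medaCy):

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank('en')
volume_matcher = Matcher(nlp.vocab)

# spaCy v3: one list of pattern lists; on_match is now a keyword argument.
volume_matcher.add('UNIT_OF_VOLUME', [
    [{'LOWER': 'ml'}],
    [{'ORTH': 'dL'}],
    [{'LOWER': 'cc'}],
    [{'ORTH': 'L'}],
])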
github Liebeck / spacy-sentiws / spacy_sentiws / __init__.py View on GitHub
def __init__(self, sentiws_path):
        self.sentiws = SentiWSWrapper(sentiws_path=sentiws_path)
        Token.set_extension('sentiws', getter=self.get_sentiment, force=True)
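Here the extension is backed by a getter rather than a stored default, so the sentiment score is computed lazily every time token._.sentiws is read and nothing is written to the token. A minimal sketch of the same getter pattern (the lexicon is an illustrative stand-in for the SentiWS data):

import spacy
from spacy.tokens import Token

nlp = spacy.blank('de')

# Illustrative stand-in for the SentiWS sentiment lexicon.
LEXICON = {'gut': 0.37, 'schlecht': -0.77}

# Getter-backed extension: the value is computed on each access.
Token.set_extension('sentiws', getter=lambda t: LEXICON.get(t.lower_), force=True)

doc = nlp('gut schlecht egal')
print([(t.text, t._.sentiws) for t in doc])
# [('gut', 0.37), ('schlecht', -0.77), ('egal', None)]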
github Roboy / ravestate / modules / ravestate_nlp / extract_triples.py View on GitHub
"""
    Recursive search through the dependency tree
    looks for triple values in each of the children and calls itself with the children nodes
    """
    question_word = None
    for word in token.children:
        if word.text.lower() in QuestionWord.question_words:
            question_word = QuestionWord(word)
            word = QuestionWord(word)
            if not triple.get_object():
                triple.set_object(question_word)
        elif word.dep_ in OBJECT_SET:
            triple.set_object(word)
        if word.dep_ in SUBJECT_SET:
            triple.set_subject(word)
        if isinstance(word, Token) and word.dep_ not in RECURSION_BLACKLIST:
            triple = triple_search(triple, word)
    if not triple.get_subject() and question_word:
        triple.set_subject(question_word)
    return triple
github tokestermw / spacy_hunspell / spacy_hunspell / __init__.py View on GitHub
def __init__(self, nlp, path=HUNSPELL_PROFILE):
        if path in DEFAULT_DICTIONARY_PATHS:
            default_path = DEFAULT_DICTIONARY_PATHS[path]
            dic_path, aff_path = (
                os.path.join(default_path, 'en_US.dic'),
                os.path.join(default_path, 'en_US.aff'),
            )
        else:
            assert len(path) == 2, 'Include two paths: dic_path and aff_path'
            dic_path, aff_path = path

        self.hobj = HunSpell(dic_path, aff_path)

        Token.set_extension('hunspell_spell', default=None)
        Token.set_extension('hunspell_suggest', getter=self.get_suggestion)
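This component combines both extension styles: hunspell_spell is stored on each token (filled in when the component runs), while hunspell_suggest is a getter evaluated on access. A hypothetical usage sketch, assuming the component has been added to an nlp pipeline:

# Assumes the spacy_hunspell component is registered on nlp.
doc = nlp('They are teh best')
for token in doc:
    if token._.hunspell_spell is False:
        # hunspell_suggest is a getter, so suggestions are computed here.
        print(token.text, token._.hunspell_suggest)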
github explosion / spacy-transformers / spacy_transformers / language.py View on GitHub
def install_extensions():
        tok2vec_attrs = [
            ATTRS.last_hidden_state,
            ATTRS.pooler_output,
            ATTRS.all_hidden_states,
            ATTRS.all_attentions,
            ATTRS.d_last_hidden_state,
            ATTRS.d_pooler_output,
            ATTRS.d_all_hidden_states,
            ATTRS.d_all_attentions,
        ]
        for attr in tok2vec_attrs:
            Doc.set_extension(attr, default=None)
            Span.set_extension(attr, getter=get_span_tok2vec_getter(attr))
            Token.set_extension(attr, getter=get_token_tok2vec_getter(attr))
        wp_attrs = [ATTRS.alignment, ATTRS.word_pieces, ATTRS.word_pieces_]
        for attr in wp_attrs:
            Doc.set_extension(attr, default=None)
            Span.set_extension(attr, getter=get_span_wp_getter(attr))
            Token.set_extension(attr, getter=get_token_wp_getter(attr))
        Doc.set_extension(ATTRS.separator, default=None)
        Span.set_extension(
            ATTRS.separator, getter=lambda span: span.doc._.get(ATTRS.separator)
        )
        Token.set_extension(
            ATTRS.separator, getter=lambda token: token.doc._.get(ATTRS.separator)
        )
        Doc.set_extension(ATTRS.segments, getter=get_segments)
        Span.set_extension(ATTRS.segments, getter=get_segments)
        for cls in [Token, Span, Doc]:
            cls.set_extension(ATTRS.start, getter=get_wp_start)
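A pattern worth noting in this snippet: the Doc holds the stored value (default=None) while the Span and Token extensions are getters that delegate to the underlying Doc, so the data lives in exactly one place per document. A minimal standalone sketch of that delegation pattern (the extension name mirrors the snippet but is used illustratively):

from spacy.tokens import Doc, Span, Token

# The Doc stores the actual value; Span and Token merely look it up.
Doc.set_extension('separator', default=None)
Span.set_extension('separator', getter=lambda span: span.doc._.separator)
Token.set_extension('separator', getter=lambda token: token.doc._.separator)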
github CogStack / MedCAT / medcat / utils / spacy_pipe.py View on GitHub
def add_spell_checker(self, spell_checker):
        spacy_spell_checker = SpacySpellChecker(spell_checker=spell_checker)
        self.nlp.add_pipe(spacy_spell_checker, name='spell_checker', last=True)

        # Add custom fields needed for this use case
        Token.set_extension('verified', default=False, force=True)
        Token.set_extension('norm', default=None, force=True)
        Token.set_extension('lower', default=None, force=True)
github NLPatVCU / medaCy / medacy / pipeline_components / annotation / gold_annotator_component.py View on GitHub
def __init__(self, spacy_pipeline, labels):
        """
        :param spacy_pipeline: An existing spaCy Language processing pipeline
        :param labels: The subset of labels from the gold annotations to restrict labeling to.
        """
        self.nlp = spacy_pipeline
        self.labels = labels
        self.failed_overlay_count = 0
        self.failed_identifying_span_count = 0
        Token.set_extension('gold_label', default="O", force=True)
github NLPatVCU / medaCy / medacy / pipeline_components / units / measurement_unit_component.py View on GitHub
def __init__(self, spacy_pipeline):
        self.nlp = spacy_pipeline
        Token.set_extension('feature_is_measurement_unit', default=False)
        self.nlp.entity.add_label('measurement_unit')
        self.unit_of_measurement_matcher = Matcher(self.nlp.vocab)

        self.unit_of_measurement_matcher.add('UNIT_OF_MEASUREMENT', None,
                         [{'ENT_TYPE': 'mass_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'volume_unit'}],
                         [{'ENT_TYPE': 'volume_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'time_unit'}],
                         [{'ENT_TYPE': 'form_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'volume_unit'}]
                         )
github explosion / spaCy / examples / pipeline / custom_component_countries_api.py View on GitHub
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c["name"]: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add("COUNTRIES", None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # If no default value is set, it defaults to None.
        Token.set_extension("is_country", default=False)
        Token.set_extension("country_capital", default=False)
        Token.set_extension("country_latlng", default=False)
        Token.set_extension("country_flag", default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension("has_country", getter=self.has_country)
        Span.set_extension("has_country", getter=self.has_country)
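Once this component runs in a pipeline, matched tokens carry the country metadata and the Doc- and Span-level getters report whether any contained token matched. A usage sketch based on the extension names registered above (assumes the component has been added to nlp):

# Assumes the countries component from this example is in the pipeline.
doc = nlp('Some text about Colombia and the Czech Republic')
print(doc._.has_country)  # True if any token was matched as a country
for token in doc:
    if token._.is_country:
        print(token.text, token._.country_capital, token._.country_flag)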