How to use the spacy.tokens.Token.set_extension function in spacy

To help you get started, we've selected a few spaCy examples based on popular ways this function is used in public projects.

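Before the project examples below, here is a minimal sketch of the three ways a token extension can be registered (a writable attribute with a default, a computed getter, and a method). The extension names is_greeting, reversed_text and prefixed are only illustrative, and the snippet assumes spaCy v2.0 or later:

import spacy
from spacy.tokens import Token

nlp = spacy.blank("en")

# Writable attribute backed by a default value
Token.set_extension("is_greeting", default=False, force=True)

# Attribute computed on access by a getter
Token.set_extension("reversed_text", getter=lambda token: token.text[::-1], force=True)

# Method extension: called with the token as its first argument
Token.set_extension("prefixed", method=lambda token, prefix: prefix + token.text, force=True)

doc = nlp("hello world")
doc[0]._.is_greeting = True       # default-backed attributes can be written per token
print(doc[0]._.is_greeting)       # True
print(doc[1]._.reversed_text)     # dlrow
print(doc[1]._.prefixed("x-"))    # x-world

Passing force=True overwrites an extension that is already registered under the same name; without it, set_extension raises a ValueError on re-registration, which is why the excerpts below use it.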

github explosion / spaCy / tests / doc / test_retokenize_split.py View on Github
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)

github explosion / spaCy / tests / doc / test_retokenize_merge.py View on Github
def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs=attrs)

github explosion / spaCy / tests / matcher / test_matcher_api.py View on Github
def test_matcher_extension_set_membership(en_vocab):
    matcher = Matcher(en_vocab)
    get_reversed = lambda token: "".join(reversed(token.text))
    Token.set_extension("reversed", getter=get_reversed, force=True)
    pattern = [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}]
    matcher.add("REVERSED", None, pattern)
    doc = Doc(en_vocab, words=["hi", "bye", "hello"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0

github bjascob / pyInflect / pyinflect / Inflections.py View on Github
def __init__(self, infl_fn, overrides_fn=None):
    self.infl_data = self._loadInflections(infl_fn)
    if overrides_fn:
        self.overrides = self._loadOverrides(overrides_fn)
    if 'spacy' in sys.modules:
        min_version = '2.0'
        mv = min_version.split('.')
        sv = spacy.__version__.split('.')
        if sv[0] > mv[0] or (sv[0] == mv[0] and sv[1] >= mv[1]):
            spacy.tokens.Token.set_extension('inflect', method=self.spacyGetInfl, force=True)
        else:
            logging.warning('Spacy extensions are disabled.  Spacy version is %s.  '
                            'A minimum of %s is required', spacy.__version__, min_version)

github ines / spacymoji / spacymoji / __init__.py View on Github
to custom descriptions, e.g. translations or other annotations.
        RETURNS (callable): A spaCy pipeline component.
        """
        self._has_emoji, self._is_emoji, self._emoji_desc, self._emoji = attrs
        self.merge_spans = merge_spans
        self.lookup = lookup
        self.matcher = PhraseMatcher(nlp.vocab)
        emoji_patterns = list(nlp.tokenizer.pipe(EMOJI.keys()))
        self.matcher.add(pattern_id, None, *emoji_patterns)
        # Add attributes
        Doc.set_extension(self._has_emoji, getter=self.has_emoji, force=force_extension)
        Doc.set_extension(self._emoji, getter=self.iter_emoji, force=force_extension)
        Span.set_extension(self._has_emoji, getter=self.has_emoji, force=force_extension)
        Span.set_extension(self._emoji, getter=self.iter_emoji, force=force_extension)
        Token.set_extension(self._is_emoji, default=False, force=force_extension)
        Token.set_extension(self._emoji_desc, getter=self.get_emoji_desc, force=force_extension)

github rominf / profanity-filter / profanity_filter / spacy_component.py View on Github
def do() -> None:
    Token.set_extension('censored', default=None)
    Token.set_extension('is_profane', getter=SpacyProfanityFilterComponent.token_is_profane)
    Token.set_extension('original_profane_word', default=None)

    Span.set_extension('is_profane', getter=SpacyProfanityFilterComponent.tokens_are_profane)
    Doc.set_extension('is_profane', getter=SpacyProfanityFilterComponent.tokens_are_profane)

github crazyfrogspb / RedditScore / redditscore / tokenizer.py View on Github
import re
import warnings

from eventlet.green.urllib.request import urlopen
from eventlet.timeout import Timeout
from redditscore.models.redditmodel import word_ngrams
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Doc, Token

try:
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer, WordNetLemmatizer
except ImportError:
    warnings.warn(
        'nltk could not be imported, some features will be unavailable')


Token.set_extension('transformed_text', default='', force=True)
Doc.set_extension('tokens', default='', force=True)

TIMEOUT = 3.0


POS_EMOJIS = [u'😂', u'❤', u'♥', u'😍', u'😘', u'😊', u'👌', u'💕',
              u'👏', u'😁', u'☺', u'♡', u'👍', u'✌', u'😏', u'😉', u'🙌', u'😄']
NEG_EMOJIS = [u'😭', u'😩', u'😒', u'😔', u'😱']
NEUTRAL_EMOJIS = [u'🙏']

NORMALIZE_RE = re.compile(r"([a-zA-Z])\1\1+")
ALPHA_DIGITS_RE = re.compile(r"[a-zA-Z0-9_]+")
TWITTER_HANDLES_RE = re.compile(r"@\w{1,15}")
REDDITORS_RE = re.compile(r"u/\w{1,20}")
SUBREDDITS_RE = re.compile(r"/r/\w{1,20}")
QUOTES_RE = re.compile(r'^".*"$')

github CogStack / MedCAT / preprocessing / spacy_pipe.py View on Github
def add_punct_tagger(self, tagger):
        """ Tagging for punct
        """
        self.nlp.add_pipe(tagger, name='tag_punct', first=True)
        # Add custom fields needed for this usecase
        Token.set_extension('is_punct', default=False, force=True)
        Token.set_extension('to_skip', default=False, force=True)