How to use the spacy.matcher.Matcher class in spacy

To help you get started, we've selected a few spaCy Matcher examples, based on popular ways it is used in public projects.
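For orientation, here is a minimal sketch of the typical workflow: create a Matcher over a shared Vocab, add one or more token-level patterns under a string key, and call the matcher on a Doc. This sketch uses the current spaCy v3 calling convention; several of the examples below use the older v2 signature, where the on_match callback was the second positional argument.

from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()  # blank pipeline; tokenizer-level attributes are enough here
matcher = Matcher(nlp.vocab)

# One pattern = one list of per-token attribute dicts
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HELLO_WORLD", [pattern])

doc = nlp("Hello, world!")
for match_id, start, end in matcher(doc):
    print(doc.vocab.strings[match_id], doc[start:end].text)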

github explosion / spaCy / tests / matcher / test_matcher_api.py
text = "Wow ๐Ÿ˜€ This is really cool! ๐Ÿ˜‚ ๐Ÿ˜‚"
    doc = Doc(en_vocab, words=text.split(" "))
    pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]
    pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]

    def label_sentiment(matcher, doc, i, matches):
        match_id, start, end = matches[i]
        if doc.vocab.strings[match_id] == "HAPPY":
            doc.sentiment += 0.1
        span = doc[start:end]
        with doc.retokenize() as retokenizer:
            retokenizer.merge(span)
        token = doc[start]
        token.vocab[token.text].norm_ = "happy emoji"

    matcher = Matcher(en_vocab)
    matcher.add("HAPPY", label_sentiment, *pos_patterns)
    matcher(doc)
    assert doc.sentiment != 0
    assert doc[1].norm_ == "happy emoji"
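Note that matcher.add() here uses the spaCy v2 signature, with the on_match callback as the second positional argument and each pattern unpacked separately. In spaCy v3 the equivalent call passes the patterns as a single list and moves the callback to a keyword argument:

# spaCy v3 equivalent of the add() call above
matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)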
github explosion / spaCy / tests / regression / test_issue1-1000.py
def test_issue850_basic():
    """Test Matcher matches with '*' operator and Boolean flag"""
    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
    matcher = Matcher(vocab)
    pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
    matcher.add("FarAway", None, pattern)
    doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
    match = matcher(doc)
    assert len(match) == 1
    ent_id, start, end = match[0]
    assert start == 0
    assert end == 4
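The "OP" key attaches a quantifier to a token pattern: "!" (exactly zero matches), "?" (zero or one), "+" (one or more), and "*" (zero or more). The "*" on the middle token is why the match above spans all four tokens, "bob and and frank". The same pattern in the current v3 style, for clarity:

# Sketch of the same pattern with the v3-style add()
pattern = [
    {"LOWER": "bob"},
    {"LOWER": "and", "OP": "*"},  # zero or more "and" tokens
    {"LOWER": "frank"},
]
matcher.add("FarAway", [pattern])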
github explosion / spaCy / tests / regression / test_issue1501-2000.py
def test_issue1883():
    matcher = Matcher(Vocab())
    matcher.add("pat1", None, [{"orth": "hello"}])
    doc = Doc(matcher.vocab, words=["hello"])
    assert len(matcher(doc)) == 1
    new_matcher = copy.deepcopy(matcher)
    new_doc = Doc(new_matcher.vocab, words=["hello"])
    assert len(new_matcher(new_doc)) == 1
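This test guards against a regression where a deep-copied Matcher lost its patterns. Copying a matcher is occasionally useful in practice; a hypothetical sketch is giving each worker its own independent instance:

import copy

# Hypothetical: one independent Matcher per worker
worker_matchers = [copy.deepcopy(matcher) for _ in range(4)]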
github explosion / spaCy / tests / matcher / test_matcher_logic.py
def test_greedy_matching(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior of the * op is consistant with
    other re implementations."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    for match, re_match in zip(matches, re_matches):
        assert match[1:] == re_match
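Beyond this default greedy behavior of "*", spaCy v3 also lets you filter overlapping matches at add() time via the greedy keyword argument, which accepts "FIRST" or "LONGEST":

# v3 sketch: keep only the longest of a set of overlapping matches
matcher.add("RULE", [pattern], greedy="LONGEST")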
github explosion / spaCy / tests / test_basic_create.py
def test_create(self):
        vocab = Vocab()
        matcher = Matcher(vocab, {})
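The empty dict here is a leftover of an older constructor signature that accepted initial patterns. In current spaCy the vocab alone is enough, and an optional validate flag turns on schema validation of patterns as they are added:

# Current-API sketch: raise informative errors on malformed patterns
matcher = Matcher(vocab, validate=True)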
github kororo / excelcy / excelcy / pipe.py
def __init__(self, nlp, patterns: list = None):
        """
        SpaCy pipe to match Entity based on multiple patterns.

        Pattern examples:
        patterns = [
            {'kind': 'phrase', 'value': 'amazon', 'entity': 'PRODUCT'},
            {'kind': 'regex', 'value': 'ama(.+)', 'entity': 'PRODUCT'}
        ]

        :param nlp: The NLP object
        :param patterns: The matcher patterns
        """
        self.nlp = nlp
        self.phrase_matcher = PhraseMatcher(nlp.vocab)
        self.matcher = Matcher(nlp.vocab)

        self.extra_patterns = []
        # start add pattern
        self.add_patterns(patterns=patterns or [])
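The docstring above distinguishes 'phrase' and 'regex' pattern kinds. The actual excelcy implementation of add_patterns is not shown here, but a plausible, purely hypothetical sketch would dispatch exact phrases to the PhraseMatcher and regexes to the token-based Matcher via the REGEX predicate (v3-style add() signatures):

def add_patterns(self, patterns: list):
    # Hypothetical sketch, not the actual excelcy code
    for pattern in patterns:
        entity, value = pattern['entity'], pattern['value']
        if pattern['kind'] == 'phrase':
            # Exact match on the tokenized phrase
            self.phrase_matcher.add(entity, [self.nlp(value)])
        elif pattern['kind'] == 'regex':
            # Single-token regex match via the REGEX predicate
            self.matcher.add(entity, [[{'TEXT': {'REGEX': value}}]])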
github aatimofeev / spacy_russian_tokenizer / spacy_russian_tokenizer / pipeline.py
                # print(doc)
                # an error occurs when there is more than one hyphen within the span; basically it can be ignored
                span.merge()
        return doc

    CYRILLIC_UPPER = r'[\p{Lu}&&\p{Cyrillic}]'
    r'(?<=[{au}])\.(?=\w+)'.format(au=CYRILLIC_UPPER)

    Language = get_lang_class('ru')
    Language.Defaults.infixes += ('«»',)
    Language.Defaults.infixes += ('-',)
    Language.Defaults.infixes += ('"\/',)
    Language.Defaults.infixes += (r'(?<=[{au}])\.(?=\w+)'.format(au=CYRILLIC_UPPER),)
    # Token.set_extension('is_adjective', default=False, force=True)
    nlp = Language()
    matcher = Matcher(nlp.vocab)
    pattern = nlp.vocab.strings['pattern']
    sentence_terminal = nlp.vocab.strings['sentence_terminal']
    if merge_patterns:
        matcher.add(pattern, None, *merge_patterns)
    if terminal_patterns:
        matcher.add(sentence_terminal, None, *terminal_patterns)
    # nlp.add_pipe(match_adjective, name='match_adjective', last=True)
    nlp.add_pipe(detect_sentence_boundaries, name='detect_sentence_boundaries', first=True)
    nlp.add_pipe(rules_matcher, name='rules_matcher', after='detect_sentence_boundaries')

    for case in HYPHEN_SPICIAL_CASES:
        nlp.tokenizer.add_special_case(case, [{'ORTH': case}])

    for case in DOT_SPECIAL_CASES:
        nlp.tokenizer.add_special_case(case, [{'ORTH': case}])
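add_special_case registers an exact string that the tokenizer must keep as a single token, which is how the hyphen and dot special cases above survive the extra infix rules. An illustrative call, with an assumed value that is not taken from the original case lists:

# Keep a hyphenated Russian form as one token (illustrative value)
nlp.tokenizer.add_special_case('по-моему', [{'ORTH': 'по-моему'}])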
github crazyfrogspb / RedditScore / redditscore / tokenizer.py
def __init__(self, lowercase=True, keepcaps=False, normalize=3,
                 ignore_quotes=False, ignore_reddit_quotes=False,
                 ignore_stopwords=False, stem=False,
                 remove_punct=True, remove_breaks=True, decontract=False,
                 twitter_handles=False, urls=False, hashtags=False,
                 numbers=False, subreddits=False, reddit_usernames=False,
                 emails=False, extra_patterns=None, keep_untokenized=None,
                 whitespaces_to_underscores=True, remove_nonunicode=False,
                 pos_emojis=None, neg_emojis=None, neutral_emojis=None,
                 print_url_warnings=False, latin_chars_fix=False,
                 ngrams=1):
        self.params = locals()

        self._nlp = English()
        self._merging_matcher = Matcher(self._nlp.vocab)
        self._matcher = Matcher(self._nlp.vocab)

        self._replacements = {}
        self._domains = {}
        self._realnames = {}
        self._stopwords = None

        alpha_digits_flag = self._nlp.vocab.add_flag(alpha_digits_check)
        hashtag_flag = self._nlp.vocab.add_flag(hashtag_check)
        twitter_handle_flag = self._nlp.vocab.add_flag(twitter_handle_check)

        self._merging_matcher.add(
            'HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])
        self._merging_matcher.add(
            'SUBREDDIT', None,
            [{'ORTH': '/r'}, {'ORTH': '/'}, {alpha_digits_flag: True}],
            [{'ORTH': 'r'}, {'ORTH': '/'}, {alpha_digits_flag: True}])
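Vocab.add_flag (a spaCy v2 API) registers a boolean getter over lexemes and returns a flag ID that can be used directly as a key in token patterns, which is exactly how the SUBREDDIT pattern above matches alphanumeric tokens. A minimal self-contained sketch of the same idea, with a made-up flag for illustration:

from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()

def is_shouting(text):
    # True for all-caps alphabetic lexemes
    return text.isalpha() and text.isupper()

# v2-style: register the flag, then use its ID as a pattern key
shouting_flag = nlp.vocab.add_flag(is_shouting)
matcher = Matcher(nlp.vocab)
matcher.add('SHOUT', None, [{shouting_flag: True}])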
github ICLRandD / Blackstone / pubref_matcher.py
from spacy.matcher import Matcher
from spacy.lang.en import English

TEXTS = [
    "The name of the case is R v Horncastle [2009] AC 123",
    "The name of the case is R v Horncastle [2009] 1 AC 123",
    "The name of the case is R v Horncastle [2009] 1 Cr App R 109",
    "The name of the case was Boaty McBoatface [2009] EWCA Civ 123",
    "The name of the case was Boaty McBoatface [2009] 1 All ER 123",
    "The name of the case was Boaty McBoatface [2009] EWHC 123 (Admin) and we like hats.",
    "I shouldn't return any matched entities.",
]

nlp = English()
matcher = Matcher(nlp.vocab)

# Matches [2010] AC 123-style
pattern1 = [
    {"IS_PUNCT": True},
    {"LIKE_NUM": True},
    {"IS_PUNCT": True},
    {"IS_ALPHA": True},
    {"LIKE_NUM": True},
]

# Matches [2010] 1 AC 123-style
pattern2 = [
    {"IS_PUNCT": True},
    {"LIKE_NUM": True},
    {"IS_PUNCT": True},
    {"LIKE_NUM": True},