How to use spacy - common examples

To help you get started, we've selected a few spaCy examples, based on popular ways the library is used in public projects.

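If you haven't used spaCy before, every example below follows the same basic pattern: build an nlp object, call it on a string to get a Doc, and work with the resulting tokens. A minimal sketch (assuming only that spaCy is installed; no pretrained model is required):

import spacy

nlp = spacy.blank("en")  # blank English pipeline: just the rule-based tokenizer
doc = nlp("spaCy turns raw text into Doc and Token objects.")
print([token.text for token in doc])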

github JulianGerhard21 / bert_spacy_rasa / bert_finetuner_splitset.py
    optimizer.L2 = 0.0
    learn_rates = cyclic_triangular_rate(
        learn_rate / 3, learn_rate * 3, 2 * len(train_data) // batch_size
    )

    pbar = tqdm.tqdm(total=100, leave=False)
    results = []
    epoch = 0
    step = 0
    eval_every = 100
    patience = 3
    while True:
        # Train and evaluate
        losses = Counter()
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_size)
        for batch in batches:
            optimizer.trf_lr = next(learn_rates)
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.1, losses=losses)
            pbar.update(1)
            if step and (step % eval_every) == 0:
                pbar.close()
                with nlp.use_params(optimizer.averages):
                    scores = evaluate_multiclass(nlp, eval_texts, eval_cats)
                results.append((scores["textcat_acc"], step, epoch))
                print(
                    "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
                        losses["trf_textcat"],
                        scores["textcat_acc"],
                        scores["textcat_cor"],
                        scores["textcat_wrg"],
                    )
                )
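This loop comes from fine-tuning a BERT-based text classifier and assumes train_data already holds (text, annotations) pairs in the spaCy v2 update format, along with helpers such as evaluate_multiclass that the excerpt does not show. A hedged sketch of what that training-data structure might look like for a two-label classifier (the label names here are placeholders, not the ones the project actually uses):

train_data = [
    ("I really enjoyed this film.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("The plot made no sense at all.", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
texts, annotations = zip(*train_data)  # the same unpacking the loop performs per batch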
github explosion / spaCy / tests / doc / test_retokenize_split.py
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
    Token.set_extension("x", default=False, force=True)
    Token.set_extension("a", getter=lambda x: x, force=True)
    Token.set_extension("b", method=lambda x: x, force=True)
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    attrs = {"_": underscore_attrs}
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            heads = [(doc[0], 1), doc[1]]
            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
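The test above checks that retokenizer.split raises a ValueError when the underscore attrs are invalid, for example values for extensions that are backed only by a getter or method and therefore cannot be set. For contrast, here is a hedged sketch of a valid split on the same words, assuming the same imports and en_vocab fixture; note that a head must be supplied for each new subtoken:

def test_doc_retokenize_split_valid(en_vocab):
    doc = Doc(en_vocab, words=["LosAngeles", "start"])
    with doc.retokenize() as retokenizer:
        heads = [(doc[0], 1), doc[1]]
        retokenizer.split(doc[0], ["Los", "Angeles"], heads)
    assert [t.text for t in doc] == ["Los", "Angeles", "start"]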
github burglarhobbit / machine-reading-comprehension / S-NET / 3_snet_without_pr_test / evaluate-v1.1.py
""" Official evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
import spacy

nlp = spacy.blank("en")

def word_tokenize(sent):
	doc = nlp(sent)
	return [token.text for token in doc]

def normalize_answer(s):

	def remove_articles(text):
		return re.sub(r'\b(a|an|the)\b', ' ', text)

	def white_space_fix(text):
		return ' '.join(text.split())

	def remove_punc(text):
		exclude = set(string.punctuation)
		return ''.join(ch for ch in text if ch not in exclude)

	def lower(text):
		return text.lower()

	# As in the official SQuAD v1.1 script, compose the helpers and return the result
	return white_space_fix(remove_articles(remove_punc(lower(s))))
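The spaCy-specific piece here is word_tokenize: spacy.blank("en") provides the rule-based English tokenizer without loading a statistical model, which keeps the evaluation script lightweight. A quick hedged illustration of what it returns:

print(word_tokenize("Who wrote the SQuAD paper?"))
# ['Who', 'wrote', 'the', 'SQuAD', 'paper', '?']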
github explosion / spaCy / tests / matcher / test_matcher_api.py
    text = "Wow 😀 This is really cool! 😂 😂"
    doc = Doc(en_vocab, words=text.split(" "))
    pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]
    pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]

    def label_sentiment(matcher, doc, i, matches):
        match_id, start, end = matches[i]
        if doc.vocab.strings[match_id] == "HAPPY":
            doc.sentiment += 0.1
        span = doc[start:end]
        with doc.retokenize() as retokenizer:
            retokenizer.merge(span)
        token = doc[start]
        token.vocab[token.text].norm_ = "happy emoji"

    matcher = Matcher(en_vocab)
    matcher.add("HAPPY", label_sentiment, *pos_patterns)
    matcher(doc)
    assert doc.sentiment != 0
    assert doc[1].norm_ == "happy emoji"
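Note that this example uses the spaCy v2 calling convention, in which the on_match callback is passed to Matcher.add as the second positional argument. In spaCy v3 the patterns come second and the callback moves to a keyword argument; a hedged sketch of the equivalent registration:

matcher = Matcher(en_vocab)
matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)  # spaCy v3-style signature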
github explosion / spaCy / tests / regression / test_issue1501-2000.py
def test_issue1537():
    """Test that Span.as_doc() doesn't segfault."""
    string = "The sky is blue . The man is pink . The dog is purple ."
    doc = Doc(Vocab(), words=string.split())
    doc[0].sent_start = True
    for word in doc[1:]:
        if word.nbor(-1).text == ".":
            word.sent_start = True
        else:
            word.sent_start = False
    sents = list(doc.sents)
    sent0 = sents[0].as_doc()
    sent1 = sents[1].as_doc()
    assert isinstance(sent0, Doc)
    assert isinstance(sent1, Doc)
github explosion / spaCy / tests / doc / test_underscore.py
def test_underscore_docstring(en_vocab):
    """Test that docstrings are available for extension methods, even though
    they're partials."""

    def test_method(doc, arg1=1, arg2=2):
        """I am a docstring"""
        return (arg1, arg2)

    Doc.set_extension("test_docstrings", method=test_method)
    doc = Doc(en_vocab, words=["hello", "world"])
    assert test_method.__doc__ == "I am a docstring"
    assert doc._.test_docstrings.__doc__.rsplit(". ")[-1] == "I am a docstring"
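The tests in this file rely on the three kinds of custom extensions spaCy supports: a settable attribute (default=...), a computed attribute (getter=...), and a callable method (method=...). A hedged sketch showing all three on Doc, assuming the same en_vocab fixture; the extension names are illustrative only:

def test_extension_kinds(en_vocab):
    Doc.set_extension("is_greeting", default=False, force=True)
    Doc.set_extension("n_tokens", getter=lambda doc: len(doc), force=True)
    Doc.set_extension("first_n", method=lambda doc, n: doc[:n], force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    doc._.is_greeting = True
    assert doc._.is_greeting is True
    assert doc._.n_tokens == 2
    assert doc._.first_n(1).text == "hello"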
github explosion / spaCy / tests / doc / test_retokenize_merge.py
def test_doc_retokenize_merge_extension_attrs(en_vocab):
    Token.set_extension("a", default=False, force=True)
    Token.set_extension("b", default="nothing", force=True)
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    # Test regular merging
    with doc.retokenize() as retokenizer:
        attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}}
        retokenizer.merge(doc[0:2], attrs=attrs)
    assert doc[0].lemma_ == "hello world"
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    # Test bulk merging
    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}})
        retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}})
    assert doc[0]._.a is True
    assert doc[0]._.b == "1"
    assert doc[1]._.a is None
    assert doc[1]._.b == "2"
github explosion / spaCy / tests / doc / test_retokenize_merge.py
def test_doc_retokenizer_merge_lex_attrs(en_vocab):
    """Test that retokenization also sets attributes on the lexeme if they're
    lexical attributes. For example, if a user sets IS_STOP, it should mean that
    "all tokens with that lexeme" are marked as a stop word, so the ambiguity
    here is acceptable. Also see #2390.
    """
    # Test regular merging
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    assert not any(t.is_stop for t in doc)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"lemma": "hello world", "is_stop": True})
    assert doc[0].lemma_ == "hello world"
    assert doc[0].is_stop
    # Test bulk merging
    doc = Doc(en_vocab, words=["eins", "zwei", "!", "!"])
    assert not any(t.like_num for t in doc)
    assert not any(t.is_stop for t in doc)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2], attrs={"like_num": True})
        retokenizer.merge(doc[2:4], attrs={"is_stop": True})
    assert doc[0].like_num
    assert doc[1].is_stop
    assert not doc[0].is_stop
    assert not doc[1].like_num
github explosion / spaCy / tests / matcher / test_matcher_api.py
def test_matcher_set_value_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"IN": ["a", "the"]}, "OP": "?"}, {"ORTH": "house"}]
    matcher.add("DET_HOUSE", None, pattern)
    doc = Doc(en_vocab, words=["In", "a", "house"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["my", "house"])
    matches = matcher(doc)
    assert len(matches) == 1
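The pattern above combines the "IN" set operator (the token text must be one of the listed values) with "OP": "?" (the token is optional), which is why "In a house" produces two matches ("a house" and "house") while "my house" produces one. The complementary "NOT_IN" operator also exists; a hedged sketch in the same style, with an illustrative match name:

def test_matcher_not_in_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"NOT_IN": ["a", "the"]}}, {"ORTH": "house"}]
    matcher.add("OTHER_DET_HOUSE", None, pattern)
    doc = Doc(en_vocab, words=["my", "house"])
    assert len(matcher(doc)) == 1
    doc = Doc(en_vocab, words=["the", "house"])
    assert len(matcher(doc)) == 0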