How to use the spacy.lang.en.English class in spaCy

To help you get started, we've selected a few spaCy examples, based on popular ways spacy.lang.en.English is used in public projects.

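spacy.lang.en.English is the blank English pipeline class: it bundles the English tokenizer rules and language defaults without loading a statistical model. As a rough, standalone sketch of the basic pattern the examples below share:

from spacy.lang.en import English

nlp = English()                        # blank pipeline: tokenization only, no trained model
doc = nlp("Apples cost $1.50 each.")
print([token.text for token in doc])   # ['Apples', 'cost', '$', '1.50', 'each', '.']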

github ines / spacymoji / tests / test_emoji.py View on Github
def test_custom_attrs():
    attrs = ('contains_emoji', 'equals_emoji', 'emoji_details', 'all_emoji')
    nlp = English()
    emoji = Emoji(nlp, attrs=attrs)
    nlp.add_pipe(emoji, last=True)
    doc = nlp(u"Hello 🎉")
    assert doc._.all_emoji
    assert len(doc._.all_emoji) == 1
    assert doc[1]._.has('equals_emoji')
    assert doc[1]._.emoji_details
github explosion / spaCy / tests / regression / test_issue3001-3500.py View on Github
def test_issue3410():
    texts = ["Hello world", "This is a test"]
    nlp = English()
    matcher = Matcher(nlp.vocab)
    phrasematcher = PhraseMatcher(nlp.vocab)
    with pytest.deprecated_call():
        docs = list(nlp.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        list(matcher.pipe(docs, n_threads=4))
    with pytest.deprecated_call():
        list(phrasematcher.pipe(docs, n_threads=4))
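The regression test above only verifies that the old n_threads argument triggers a deprecation warning. In newer spaCy releases that argument has no effect and batching is controlled with batch_size instead; a minimal sketch of the non-deprecated calls, using the same example texts, could be:

from spacy.lang.en import English

nlp = English()
texts = ["Hello world", "This is a test"]
docs = list(nlp.pipe(texts, batch_size=32))                  # full pipeline in batches
tokens_only = list(nlp.tokenizer.pipe(texts, batch_size=32)) # tokenizer-only batching

Matcher.pipe and PhraseMatcher.pipe were deprecated along the same lines and later removed; matching is now done one Doc at a time.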
github explosion / spaCy / tests / regression / test_issue3001-3500.py View on Github
def test_issue3345():
    """Test case where preset entity crosses sentence boundary."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    doc[4].is_sent_start = True
    ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
    ner = EntityRecognizer(doc.vocab)
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Get into the state just before "New"
    state = ner.moves.init_batch([doc])[0]
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    # Check that B-GPE is valid.
    assert ner.moves.is_valid(state, "B-GPE")
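The test drives the NER transition system by hand to reproduce one specific bug. In everyday use the EntityRuler is simply added to a blank English pipeline and the matched span shows up in doc.ents; a short illustrative sketch (not part of the spaCy test suite) in the same v2-style API:

from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
nlp.add_pipe(ruler)                    # spaCy v2 style; v3 uses nlp.add_pipe("entity_ruler")
doc = nlp("I live in New York")
print([(ent.text, ent.label_) for ent in doc.ents])   # [('New York', 'GPE')]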
github facebookresearch / ELI5 / data_creation / data_utils.py View on Github
yield
    finally:
        signal.alarm(0)

# URL match regex
URL_REGEX   = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))"""

html_pairs = [(">",  " > "),
              ("<",  " < "),
              ]

tokenizer = English().Defaults.create_tokenizer()

# tokenizes and removes URLs (kept in separate list)
def pre_word_url_tokenize(stp):
    url_list = list(set(re.findall(URL_REGEX, stp)))
    # stp = st.strip()
    for i, url in enumerate(url_list):
        stp = stp.replace(url, " URL_%d " % (i,))
    for a, b in html_pairs:
        stp = stp.replace(a, b)
    pre_txt = ' '.join([str(x) for x in tokenizer(stp)])
    return (' '.join(pre_txt.split()), url_list)


# wrap inside a timer to catch cases where SpaCy tokenizer hangs on too many dots
def word_url_tokenize(st, max_len=20480, max_cont_len=512):
    stp = ' '.join([w[:max_cont_len] if w[:max_cont_len].count('.') <= 12 else '.' for w in st.split()[:max_len]])
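The yield / signal.alarm(0) fragment at the top of this excerpt is the tail of the timer mentioned in the comment; its definition is cut off here. A rough sketch of such a signal-based time limit, assuming a Unix platform and not necessarily matching ELI5's exact implementation, could look like this:

import signal
from contextlib import contextmanager

@contextmanager
def time_limit(seconds):
    def _handler(signum, frame):                 # hypothetical handler name
        raise TimeoutError("tokenization timed out")
    signal.signal(signal.SIGALRM, _handler)      # SIGALRM is Unix-only
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)                          # always cancel the pending alarm

# usage: skip documents whose tokenization hangs on pathological input
try:
    with time_limit(5):
        text, urls = pre_word_url_tokenize("see http://example.com for details ....")
except TimeoutError:
    text, urls = "", []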
github explosion / spaCy / examples / pipeline / custom_component_countries_api.py View on Github
def main():
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    rest_countries = RESTCountriesComponent(nlp)  # initialise component
    nlp.add_pipe(rest_countries)  # add it to the pipeline
    doc = nlp("Some text about Colombia and the Czech Republic")
    print("Pipeline", nlp.pipe_names)  # pipeline contains component name
    print("Doc has countries", doc._.has_country)  # Doc contains countries
    for token in doc:
        if token._.is_country:
            print(
                token.text,
                token._.country_capital,
                token._.country_latlng,
                token._.country_flag,
            )  # country data
    print("Entities", [(e.text, e.label_) for e in doc.ents])  # entities
github facebookresearch / craftassist / python / base_agent / ttad / ttad_model / processing_scripts / read_rephrased.py View on Github
"""
Copyright (c) Facebook, Inc. and its affiliates.
"""

from copy import deepcopy

# from pprint import pprint
import csv
import json

from spacy.lang.en import English

tokenizer = English().Defaults.create_tokenizer()


def word_tokenize(st):
    return [(x.text, x.idx) for x in tokenizer(st)]


rephrases = []
for j in range(5):
    with open("rephrase_%d.csv" % (j,)) as csvfile:
        g_reader = csv.reader(csvfile)
        for i, row in enumerate(g_reader):
            if i > 0:
                rephrases += [row[-2:]]


brackets = [("(", ")"), ("{", "}"), ("[", "]"), ("*", "*"), ("$", "$"), ("#", "#")]
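word_tokenize returns each token paired with its character offset in the input string, which the rest of the script uses to align rephrased spans. For example (illustrative input, not CraftAssist data):

print(word_tokenize("build a red house"))
# [('build', 0), ('a', 6), ('red', 8), ('house', 12)]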
github spacemanidol / MSMARCO / Evaluation / ms_marco_eval.py View on Github
def normalize_batch(p_iter, p_batch_size=1000, p_thread_count=5):
    """Normalize and tokenize strings.

    Args:
    p_iter (iter): iter over strings to normalize and tokenize.
    p_batch_size (int): number of strings per batch.
    p_thread_count (int): number of threads running.

    Returns:
    iter: iter over normalized and tokenized strings.
    """

    global NLP
    if not NLP:
        NLP = NlpEnglish(parser=False)

    output_iter = NLP.pipe(p_iter, \
                           batch_size=p_batch_size, \
                           n_threads=p_thread_count)

    for doc in output_iter:
        tokens = [str(w).strip().lower() for w in doc]
        yield ' '.join(tokens)
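NLP here is a lazily created, module-level pipeline (NlpEnglish is presumably the English class imported under another name), and n_threads is the same legacy argument seen in the regression test above. A rough equivalent with a plain blank pipeline and made-up example strings:

from spacy.lang.en import English

nlp = English()    # blank pipeline; no parser to disable
queries = ["What is the capital of France?", "how long do lobsters live"]  # illustrative strings
for doc in nlp.pipe(queries, batch_size=1000):
    print(" ".join(token.text.strip().lower() for token in doc))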
github vered1986 / OKR / src / baseline_system / predicate_entailment.py View on Github
def __init__(self, resource_file):

        # Load the resource file
        self.entailment_rules = bsddb.btopen(resource_file, 'r')

        # Set threshold to default as recommended
        self.threshold = 0.0

        self.nlp = English()