Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def empty_spacy_doc():
    """Return a spaCy ``Doc`` produced by parsing an empty string."""
    return cache.load_spacy_lang("en")("")
def spacy_doc():
    """Return a spaCy ``Doc`` made by running the shared ``TEXT`` sample
    through the cached English pipeline."""
    en_pipeline = cache.load_spacy_lang("en")
    return en_pipeline(TEXT)
def spacy_lang():
    """Yield the cached English pipeline with a ``TextStatsComponent``
    inserted after the parser; remove it again on teardown so the shared
    cached pipeline is left unmodified for other tests."""
    pipeline = cache.load_spacy_lang("en")
    pipeline.add_pipe(components.TextStatsComponent(), after="parser")
    yield pipeline
    # teardown: the pipeline object is cached/shared, so undo our change
    pipeline.remove_pipe("textacy_text_stats")
def test_load_model(self):
    """Loading by short code or full model name, with or without disabled
    pipes, always yields a ``spacy.language.Language`` instance."""
    for lang in ("en", "en_core_web_sm"):
        for disable in (None, ("tagger", "parser", "ner")):
            loaded = cache.load_spacy_lang(lang, disable=disable)
            assert isinstance(loaded, spacy.language.Language)
def spacy_doc():
    """Return a short two-sentence ``Doc`` for exercising doc-level APIs."""
    sample = "I would have lived in peace. But my enemies brought me war."
    return cache.load_spacy_lang("en")(sample)
def test_load_pyphen():
    """Smoke test: hyphenators for supported languages load without error."""
    for lang_code in ("en", "es"):
        _ = cache.load_hyphenator(lang=lang_code)
        # reaching this point without an exception is the whole test
        assert True
chunk_size (int): Number of characters comprising each text chunk
(excluding the last chunk, which is probably smaller). For best
performance, value should be somewhere between 1e3 and 1e7,
depending on how much RAM you have available.
.. note:: Since chunking is done by character, chunk edges probably
won't respect natural language segmentation, which means that every
``chunk_size`` characters, spaCy will probably get tripped up and
make weird parsing errors.
Returns:
:class:`spacy.tokens.Doc`: A single processed document, initialized from
components accumulated chunk by chunk.
"""
if isinstance(lang, compat.unicode_):
lang = cache.load_spacy_lang(lang)
elif not isinstance(lang, Language):
raise TypeError(
"`lang` must be {}, not {}".format({compat.unicode_, Language}, type(lang))
)
words = []
spaces = []
np_arrays = []
cols = [attrs.POS, attrs.TAG, attrs.DEP, attrs.HEAD, attrs.ENT_IOB, attrs.ENT_TYPE]
text_len = len(text)
i = 0
# iterate over text chunks and accumulate components needed to make a doc
while i < text_len:
chunk_doc = lang(text[i : i + chunk_size])
words.extend(tok.text for tok in chunk_doc)
spaces.extend(bool(tok.whitespace_) for tok in chunk_doc)
if format == "pickle":
with open_sesame(filepath, mode="rb") as f:
for spacy_doc in compat.pickle.load(f):
yield spacy_doc
elif format == "binary":
if lang is None:
raise ValueError(
"When format='binary', a `spacy.Language` (and its associated "
"`spacy.Vocab`) is required to deserialize the binary data; "
"and these should be the same as were used when processing "
"the original docs!"
)
elif isinstance(lang, Language):
vocab = lang.vocab
elif isinstance(lang, compat.unicode_):
vocab = cache.load_spacy_lang(lang).vocab
else:
raise ValueError(
"lang = '{}' is invalid; must be a str or `spacy.Language`"
)
with open_sesame(filepath, mode="rb") as f:
unpacker = msgpack.Unpacker(f, raw=False, unicode_errors="strict")
for msg in unpacker:
# NOTE: The following code has been adapted from spaCy's
# built-in ``spacy.Doc.from_bytes()``. If that functionality
# changes, the following will probably break...
# Msgpack doesn't distinguish between lists and tuples, which is
# vexing for user data. As a best guess, we *know* that within
# keys, we must have tuples. In values we just have to hope
# users don't mind getting a list instead of a tuple.
def _make_spacy_doc_from_record(record, lang):
    """Make a spaCy ``Doc`` from a ``(text, metadata)`` record.

    Args:
        record: 2-item sequence ``(text, meta)``, where ``text`` is the string
            to process and ``meta`` is attached to the doc as ``doc._.meta``.
        lang (str or :class:`spacy.language.Language` or Callable): Language
            pipeline to process the text with — given directly, by name, or as
            a callable that maps the text to a pipeline name.

    Returns:
        :class:`spacy.tokens.Doc`: Processed doc with ``doc._.meta`` set.

    Raises:
        TypeError: If ``lang`` is not a str, ``Language``, or callable.
    """
    if isinstance(lang, compat.unicode_):
        spacy_lang = cache.load_spacy_lang(lang)
    elif isinstance(lang, spacy.language.Language):
        spacy_lang = lang
    elif callable(lang):
        # infer the language from the record's text, then load that pipeline
        spacy_lang = cache.load_spacy_lang(lang(record[0]))
    else:
        raise TypeError(
            "`lang` must be {}, not {}".format(
                {compat.unicode_, spacy.language.Language, types.FunctionType},
                type(lang),
            )
        )
    doc = spacy_lang(record[0])
    doc._.meta = record[1]
    return doc
def _make_spacy_doc_from_text(text, lang):
    """Make a spaCy ``Doc`` by processing ``text`` with a language pipeline.

    Args:
        text (str): Text to process.
        lang (str or :class:`spacy.language.Language` or Callable): Language
            pipeline to process ``text`` with — given directly, by name, or as
            a callable that maps ``text`` to a pipeline name.

    Returns:
        :class:`spacy.tokens.Doc`: Processed doc.

    Raises:
        TypeError: If ``lang`` is not a str, ``Language``, or callable.
    """
    if isinstance(lang, compat.unicode_):
        spacy_lang = cache.load_spacy_lang(lang)
    elif isinstance(lang, spacy.language.Language):
        spacy_lang = lang
    elif callable(lang):
        # infer the language from the text itself, then load that pipeline
        spacy_lang = cache.load_spacy_lang(lang(text))
    else:
        raise TypeError(
            "`lang` must be {}, not {}".format(
                {compat.unicode_, spacy.language.Language, types.FunctionType},
                type(lang),
            )
        )
    return spacy_lang(text)