def nlp():
    return Language()
class AnalysisType(Enum):
    DEEP = 'deep'
    MORPHOLOGICAL = 'morphological'
    MULTILINGUAL = 'multilingual'
Words = Dict[str, Word]
AnalysesTypes = FrozenSet[AnalysisType]
Language = Optional[str]
ProfaneWordDictionary = 'OrderedSet[str]'
ProfaneWordDictionaryAcceptable = Collection[str]
ProfaneWordDictionaries = Dict[Language, ProfaneWordDictionary]
ProfaneWordDictionariesAcceptable = Optional[Dict[Language, ProfaneWordDictionaryAcceptable]]
Languages = 'OrderedSet[Language]'
LanguagesAcceptable = Collection[Language]
Nlps = Dict[Language, spacy.language.Language]
Morphs = Dict[Language, 'MorphAnalyzer']
Spells = Dict[Language, 'HunSpell']
Substrings = Generator[Tuple[str, int, int], Tuple[int, int], None]
TextSplittedByLanguage = List[Tuple[Language, str]]
PathOrStr = Union[Path, str]
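# Illustrative sketch, not part of the original module: the aliases above are
# meant to be used as type annotations, e.g. one ordered profane-word set per
# language code. The function name is hypothetical, and ordered_set.OrderedSet
# is assumed to be the OrderedSet referenced in the forward references above.
from ordered_set import OrderedSet

def make_empty_dictionaries(languages: LanguagesAcceptable) -> ProfaneWordDictionaries:
    # One empty profane-word set per language code (None means the default language)
    return {language: OrderedSet() for language in languages}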
class PersianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    lex_attr_getters[LANG] = lambda text: "fa"
    tokenizer_exceptions = update_exc(TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    suffixes = TOKENIZER_SUFFIXES
    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


class Persian(Language):
    lang = "fa"
    Defaults = PersianDefaults


__all__ = ["Persian"]
depending on how much RAM you have available.

.. note:: Since chunking is done by character, chunk edges probably won't
   respect natural language segmentation, which means that every
   ``chunk_size`` characters, spaCy will probably get tripped up and make
   weird parsing errors.

Returns:
    :class:`spacy.tokens.Doc`: A single processed document, initialized from
    components accumulated chunk by chunk.
"""
if isinstance(lang, compat.unicode_):
    lang = cache.load_spacy_lang(lang)
elif not isinstance(lang, Language):
    raise TypeError(
        "`lang` must be {}, not {}".format({compat.unicode_, Language}, type(lang))
    )
words = []
spaces = []
np_arrays = []
cols = [attrs.POS, attrs.TAG, attrs.DEP, attrs.HEAD, attrs.ENT_IOB, attrs.ENT_TYPE]
text_len = len(text)
i = 0
# iterate over text chunks and accumulate components needed to make a doc
while i < text_len:
    chunk_doc = lang(text[i : i + chunk_size])
    words.extend(tok.text for tok in chunk_doc)
    spaces.extend(bool(tok.whitespace_) for tok in chunk_doc)
    np_arrays.append(chunk_doc.to_array(cols))
    i += chunk_size
# now, initialize the doc from words and spaces
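# Sketch of the step announced by the comment above, not part of the original
# excerpt: the accumulated words, spaces, and per-chunk attribute arrays could
# be combined into a single Doc roughly like this (numpy is assumed to be
# imported as np).
doc = spacy.tokens.Doc(lang.vocab, words=words, spaces=spaces)
doc = doc.from_array(cols, np.concatenate(np_arrays, axis=0))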
class ArabicDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "ar"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    suffixes = TOKENIZER_SUFFIXES
    writing_system = {"direction": "rtl", "has_case": False, "has_letters": True}


class Arabic(Language):
    lang = "ar"
    Defaults = ArabicDefaults


__all__ = ["Arabic"]
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG
class IcelandicDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: "is"
    stop_words = STOP_WORDS


class Icelandic(Language):
    lang = "is"
    Defaults = IcelandicDefaults


__all__ = ["Icelandic"]
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
class SerbianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "sr"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    resources = {"lemma_lookup": "lemma_lookup.json"}


class Serbian(Language):
    lang = "sr"
    Defaults = SerbianDefaults
lex_attr_getters[LANG] = lambda _text: "ja"
stop_words = STOP_WORDS
tag_map = TAG_MAP
syntax_iterators = SYNTAX_ITERATORS  # TODO: does not work with spaCy 2.0.12; see the workaround in JapaneseCorrector
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
@classmethod
def create_tokenizer(cls, nlp=None):
    return SudachiTokenizer(nlp)

@classmethod
def create_lemmatizer(cls, nlp=None):
    return None


class Japanese(Language):
    lang = "ja"
    Defaults = JapaneseDefaults
    Tokenizer = SudachiTokenizer

    def make_doc(self, text):
        return self.tokenizer(text)


def pickle_japanese(instance):
    return Japanese, tuple()


copy_reg.pickle(Japanese, pickle_japanese)
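# Illustrative usage sketch, not part of the original module: because
# make_doc() returns self.tokenizer(text), calling the pipeline object runs
# SudachiTokenizer directly and yields an already-tokenized Doc. This assumes
# SudachiPy and its dictionary are installed; the sample sentence is arbitrary.
nlp_ja = Japanese()
doc = nlp_ja("すもももももももものうち")
print([token.text for token in doc])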
__all__ = [