How to use the spacy.util.get_lang_class function in spacy

To help you get started, we've selected a few spacy examples, based on popular ways it is used in public projects.

github explosion / spacy-transformers / examples / View on Github external
    """Test the wordpiecer on a large dataset to find misalignments. If both the
    retry and force flag are set (which is the default runtime configuration),
    this script should always pass.

    * retry: If alignment fails after cleaning and normalizing both sets of
        tokens, try again with a more aggressive strategy that strips out all
        characters that are not uppercase/lowercase letters.
    * force: If alignment still fails, run the word-piece tokenizer on the
        individual spaCy tokens, so that alignment is trivial. This should
        always work.
    cfg = {"retry_alignment": retry, "force_alignment": force}
    nlp = get_lang_class(lang)()
    wp = TransformersWordPiecer.from_pretrained(nlp.vocab, trf_name=name, **cfg)
    msg.good(f"Loaded WordPiecer for model '{name}'")
    with msg.loading("Loading IMDB data..."):
        data, _ =
    texts, _ = zip(*data)
    msg.good(f"Using {len(texts)} texts from IMDB data")"Processing texts...")
    sent_counts = 0
    for doc in tqdm.tqdm(nlp.pipe(texts), total=len(texts)):
            doc = wp(doc)
            sent_counts += len(list(doc.sents))
        except AssertionError as e:
            if len(e.args) and isinstance(e.args[0], tuple):  # Misaligned error
                a, b = e.args[0]
github explosion / spaCy / tests / lang / View on Github external
def test_lang_initialize(lang, capfd):
    """Check that the language class for ``lang`` can be created and called.

    The pipeline is run on a short text and the captured stdout must be
    empty, which guards against stray print statements (see #3342).
    """
    language_cls = get_lang_class(lang)
    nlp = language_cls()
    nlp("test")
    stdout, _stderr = capfd.readouterr()
    assert not stdout
github explosion / spaCy / spacy / cli / converters / View on Github external
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
    """Convert newline-delimited JSON NER records into JSON docs.

    Every line of ``input_data`` is parsed as one JSON record. A record must
    have a "text" key and either an "entities" or a "spans" key holding a list
    of dicts with "start", "end" and "label" entries.

    input_data (str): The JSONL content, one record per line.
    lang (str): Language code used to look up the language class for
        tokenization. Required.
    n_sents (int): Batch size handed to ``minibatch``; each batch of records
        becomes one entry in the result.
    use_morphology (bool): Not used by this converter.
    RETURNS (list): One ``docs_to_json`` result per batch.
    RAISES (ValueError): If ``lang`` is None.
    """
    if lang is None:
        raise ValueError("No --lang specified, but tokenization required")
    json_docs = []
    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
    nlp = get_lang_class(lang)()
    sentencizer = nlp.create_pipe("sentencizer")
    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
        docs = []
        for record in batch:
            raw_text = record["text"]
            # Prefer "entities"; only fall back to "spans" when it is absent.
            if "entities" in record:
                ents = record["entities"]
            else:
                ents = record["spans"]
            ents = [(e["start"], e["end"], e["label"]) for e in ents]
            doc = nlp.make_doc(raw_text)
            # Apply the sentencizer created above so the doc gets sentence
            # boundaries before it is serialized.
            sentencizer(doc)
            spans = [doc.char_span(start, end, label=label) for start, end, label in ents]
            doc.ents = _cleanup_spans(spans)
            # Collect the doc, otherwise every batch would serialize as empty.
            docs.append(doc)
        json_docs.append(docs_to_json(docs, id=i))
    return json_docs
github explosion / spaCy / spacy / cli / View on Github external
def create_model(lang, lex_attrs, name=None):
    """Create a blank pipeline for ``lang`` and register lexemes in its vocab.

    lang (str): Language code used to look up the language class.
    lex_attrs (iterable): Dicts of lexeme attributes. Entries containing a
        "settings" key are skipped; all others must provide "orth".
    name (str): Not used in the visible part of this function.
    """
    lang_class = get_lang_class(lang)
    nlp = lang_class()
    for lexeme in nlp.vocab:
        lexeme.rank = 0
    lex_added = 0
    for attrs in lex_attrs:
        if "settings" in attrs:
            # Entries with a "settings" key are not lexemes, so skip them
            # before the "orth" lookup below.
            continue
        lexeme = nlp.vocab[attrs["orth"]]
        lexeme.is_oov = False
        # Count each registered lexeme exactly once.
        lex_added += 1
    if len(nlp.vocab):
        oov_prob = min(lex.prob for lex in nlp.vocab) - 1
    else:
        # An empty vocab has no minimum probability to derive from.
        oov_prob = DEFAULT_OOV_PROB
    # NOTE(review): this excerpt ends here. Where oov_prob is stored and what
    # the function returns are not visible - confirm against the full source.
github EtienneAb3d / OpenNeuroSpell / SPACY / View on Github external
def get_model_desc(nlp, model_name):
    """Build a readable label from the language class name, model name and version.

    The language name is the ``__name__`` of the class registered for
    ``nlp.lang``; the version is read from ``nlp.meta["version"]``.
    """
    template = "{} - {} (v{})"
    language_name = spacy.util.get_lang_class(nlp.lang).__name__
    return template.format(language_name, model_name, nlp.meta["version"])
github shyamupa / wikidump_preprocessing / dp / View on Github external
def get_tokenizer(lang):
    """Return the tokenizer created by the defaults of the language ``lang``."""
    language = spacy.util.get_lang_class(lang)()
    defaults = language.Defaults
    return defaults.create_tokenizer()
github explosion / spaCy / spacy / cli / View on Github external
def create_nlp_from_config(lang, vectors, pipeline):
    """Build a pipeline for ``lang`` from a component configuration.

    lang (str): Language code used to look up the language class.
    vectors: Passed to ``spacy.cli.train._load_vectors`` when not None.
    pipeline (dict): Maps component names to config dicts. Each config must
        contain a "factory" key; the remaining entries are passed to the
        factory as its config. The dicts are left unmodified.
    RETURNS: The language object with all components added in dict order.
    """
    lang_class = spacy.util.get_lang_class(lang)
    nlp = lang_class()
    if vectors is not None:
        spacy.cli.train._load_vectors(nlp, vectors)
    for name, component_cfg in pipeline.items():
        # Work on a copy: popping "factory" from the caller's dict would make
        # a second call with the same config fail with a KeyError.
        settings = dict(component_cfg)
        factory = settings.pop("factory")
        component = nlp.create_pipe(factory, config=settings)
        nlp.add_pipe(component, name=name)
    return nlp
github aatimofeev / spacy_russian_tokenizer / spacy_russian_tokenizer / View on Github external
def pipeline(merge_patterns=None, terminal_patterns=None):
    """Build a Russian pipeline with extra infixes and a custom tokenizer step.

    merge_patterns: Forwarded to ``RussianTokenizer``.
    terminal_patterns: Forwarded to ``RussianTokenizer``.
    RETURNS: The configured language object.

    NOTE: the infix patterns are appended to the class-level
    ``Language.Defaults.infixes``, so every call adds them again.
    """
    CYRILLIC_UPPER = r'[\p{Lu}&&\p{Cyrillic}]'

    Language = get_lang_class('ru')
    Language.Defaults.infixes += (
        # Guillemets; the literal had been mis-decoded into Thai characters.
        '«»',
        '-',
        # Raw string: "\/" is an invalid escape sequence in a normal literal.
        r'"\/',
        '/',
        r'(?<=[{au}])\.(?=\w+)'.format(au=CYRILLIC_UPPER),
    )

    # Token.set_extension('is_adjective', default=False, force=True)
    nlp = Language()
    russian_tokenizer = RussianTokenizer(nlp, merge_patterns=merge_patterns, terminal_patterns=terminal_patterns)

    nlp.add_pipe(detect_sentence_boundaries, name='detect_sentence_boundaries', first=True)
    # nlp.add_pipe(match_adjective, name='match_adjective', after='detect_sentence_boundaries')
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer', after='detect_sentence_boundaries')

    # Keep every special case as a single token.
    for case in SPECIAL_CASES:
        nlp.tokenizer.add_special_case(case, [{'ORTH': case}])

    # Hand the assembled pipeline back; previously it was built and discarded.
    return nlp
github explosion / spaCy / spacy / cli / View on Github external
def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors):
    """Create a blank pipeline for ``lang`` and fill its vocab from ``probs``.

    Words are ranked by descending probability. Each lexeme gets its rank,
    probability and cluster, and is marked as in-vocabulary.

    lang (str): Language code used to look up the language class.
    probs (dict): Maps words to probabilities.
    clusters (dict): Maps words to cluster IDs given as strings of binary
        digits. Words without an entry get cluster 0.
    oov_prob, vectors_data, vector_keys, prune_vectors: Not used in the
        visible part of this function.
    """
    print("Creating model...")
    lang_class = get_lang_class(lang)
    nlp = lang_class()
    for lexeme in nlp.vocab:
        lexeme.rank = 0

    lex_added = 0
    ranked = sorted(probs.items(), key=lambda item: item[1], reverse=True)
    for i, (word, prob) in enumerate(tqdm(ranked)):
        lexeme = nlp.vocab[word]
        lexeme.rank = i
        lexeme.prob = prob
        lexeme.is_oov = False
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        if word in clusters:
            lexeme.cluster = int(clusters[word][::-1], 2)
        else:
            # Only fall back to 0 when no cluster is known; assigning it
            # unconditionally would discard the decoded value above.
            lexeme.cluster = 0
    # NOTE(review): this excerpt ends here. lex_added is never updated and
    # nothing is returned in the visible part - confirm against the full source.
github explosion / spaCy / bin / ud / View on Github external
def load_default_model_sentencizer(lang):
    """Load a generic spaCy model and add the sentencizer for sentence tokenization.

    lang (str): Language code used to look up the language class.
    RETURNS (tuple): The language object, the wall-clock loading time in
        seconds, and the label ``lang + "_default_sentencizer"``.
    """
    loading_start = time.time()
    lang_class = get_lang_class(lang)
    nlp = lang_class()
    # Add the sentencizer that the docstring and the returned label promise.
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    loading_end = time.time()
    loading_time = loading_end - loading_start
    return nlp, loading_time, lang + "_default_" + 'sentencizer'