How to use the textacy.cache function in textacy

To help you get started, we’ve selected a few textacy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github chartbeat-labs / textacy / tests / test_keyterms.py View on Github external
def empty_spacy_doc():
    """Return a spaCy doc produced by running the "en" pipeline on an empty string."""
    nlp = cache.load_spacy_lang("en")
    return nlp("")
github chartbeat-labs / textacy / tests / test_io.py View on Github external
def spacy_doc():
    """Process the module-level ``TEXT`` sample into a spaCy doc fixture."""
    nlp = cache.load_spacy_lang("en")
    return nlp(TEXT)
github chartbeat-labs / textacy / tests / spacier / test_components.py View on Github external
def spacy_lang():
    """Yield an "en" pipeline with a ``TextStatsComponent`` attached after the parser,
    then detach the component on teardown so other tests see a clean pipeline."""
    nlp = cache.load_spacy_lang("en")
    nlp.add_pipe(components.TextStatsComponent(), after="parser")

    yield nlp

    # teardown: remove the component added above
    nlp.remove_pipe("textacy_text_stats")
github chartbeat-labs / textacy / tests / test_cache.py View on Github external
def test_load_model(self):
    """Loading an English model — by short code or full name, with or without
    disabled pipes — must yield a ``spacy.language.Language`` instance."""
    for lang in ["en", "en_core_web_sm"]:
        for disable in [None, ("tagger", "parser", "ner")]:
            spacy_lang = cache.load_spacy_lang(lang, disable=disable)
            assert isinstance(spacy_lang, spacy.language.Language)
github chartbeat-labs / textacy / tests / test_export.py View on Github external
def spacy_doc():
    """Build a spaCy doc fixture from a short two-sentence sample text."""
    text = "I would have lived in peace. But my enemies brought me war."
    nlp = cache.load_spacy_lang("en")
    return nlp(text)
github chartbeat-labs / textacy / tests / test_cache.py View on Github external
def test_load_pyphen():
    """Loading a hyphenator for each supported language should succeed.

    The original body discarded the result and ended with ``assert True``,
    which can never fail; assert something meaningful instead — the loader
    must return a usable (non-None) hyphenator without raising.
    """
    for lang in ("en", "es"):
        assert cache.load_hyphenator(lang=lang) is not None
github chartbeat-labs / textacy / textacy / spacier / utils.py View on Github external
chunk_size (int): Number of characters comprising each text chunk
            (excluding the last chunk, which is probably smaller). For best
            performance, value should be somewhere between 1e3 and 1e7,
            depending on how much RAM you have available.

            .. note:: Since chunking is done by character, chunk edges probably
               won't respect natural-language segmentation, which means that every
               ``chunk_size`` characters, spaCy will probably get tripped up and
               make weird parsing errors.

    Returns:
        :class:`spacy.tokens.Doc`: A single processed document, initialized from
        components accumulated chunk by chunk.
    """
    if isinstance(lang, compat.unicode_):
        lang = cache.load_spacy_lang(lang)
    elif not isinstance(lang, Language):
        raise TypeError(
            "`lang` must be {}, not {}".format({compat.unicode_, Language}, type(lang))
        )

    words = []
    spaces = []
    np_arrays = []
    cols = [attrs.POS, attrs.TAG, attrs.DEP, attrs.HEAD, attrs.ENT_IOB, attrs.ENT_TYPE]
    text_len = len(text)
    i = 0
    # iterate over text chunks and accumulate components needed to make a doc
    while i < text_len:
        chunk_doc = lang(text[i : i + chunk_size])
        words.extend(tok.text for tok in chunk_doc)
        spaces.extend(bool(tok.whitespace_) for tok in chunk_doc)
github chartbeat-labs / textacy / textacy / io / spacy.py View on Github external
if format == "pickle":
        with open_sesame(filepath, mode="rb") as f:
            for spacy_doc in compat.pickle.load(f):
                yield spacy_doc
    elif format == "binary":
        if lang is None:
            raise ValueError(
                "When format='binary', a `spacy.Language` (and its associated "
                "`spacy.Vocab`) is required to deserialize the binary data; "
                "and these should be the same as were used when processing "
                "the original docs!"
            )
        elif isinstance(lang, Language):
            vocab = lang.vocab
        elif isinstance(lang, compat.unicode_):
            vocab = cache.load_spacy_lang(lang).vocab
        else:
            raise ValueError(
                "lang = '{}' is invalid; must be a str or `spacy.Language`"
            )
        with open_sesame(filepath, mode="rb") as f:
            unpacker = msgpack.Unpacker(f, raw=False, unicode_errors="strict")
            for msg in unpacker:

                # NOTE: The following code has been adapted from spaCy's
                # built-in ``spacy.Doc.from_bytes()``. If that functionality
                # changes, the following will probably break...

                # Msgpack doesn't distinguish between lists and tuples, which is
                # vexing for user data. As a best guess, we *know* that within
                # keys, we must have tuples. In values we just have to hope
                # users don't mind getting a list instead of a tuple.
github chartbeat-labs / textacy / textacy / doc.py View on Github external
def _make_spacy_doc_from_record(record, lang):
    """Make a spaCy doc from a ``(text, metadata)`` record, attaching the metadata.

    Args:
        record (Tuple[str, dict]): Text to process plus its associated metadata.
        lang (str or :class:`spacy.language.Language` or Callable): Language
            pipeline name, an already-loaded pipeline, or a callable that maps
            the record's text to a language identifier string.

    Returns:
        :class:`spacy.tokens.Doc`: Processed doc with ``doc._.meta`` set.

    Raises:
        TypeError: If ``lang`` is none of the accepted types.

    Note: the original implementation also assigned a ``langstr`` local in
    every branch, but never used it — that dead local is removed here.
    """
    if isinstance(lang, compat.unicode_):
        spacy_lang = cache.load_spacy_lang(lang)
    elif isinstance(lang, spacy.language.Language):
        spacy_lang = lang
    elif callable(lang):
        # language-identifier function: run it on the record's text first
        spacy_lang = cache.load_spacy_lang(lang(record[0]))
    else:
        raise TypeError(
            "`lang` must be {}, not {}".format(
                {compat.unicode_, spacy.language.Language, types.FunctionType},
                type(lang),
            )
        )
    doc = spacy_lang(record[0])
    doc._.meta = record[1]
    return doc
github chartbeat-labs / textacy / textacy / doc.py View on Github external
def _make_spacy_doc_from_text(text, lang):
    """Make a spaCy doc by processing ``text`` with the resolved language pipeline.

    Args:
        text (str): Text to process.
        lang (str or :class:`spacy.language.Language` or Callable): Language
            pipeline name, an already-loaded pipeline, or a callable that maps
            ``text`` to a language identifier string.

    Returns:
        :class:`spacy.tokens.Doc`

    Raises:
        TypeError: If ``lang`` is none of the accepted types.

    Note: the original implementation also assigned a ``langstr`` local in
    every branch, but never used it — that dead local is removed here.
    """
    if isinstance(lang, compat.unicode_):
        spacy_lang = cache.load_spacy_lang(lang)
    elif isinstance(lang, spacy.language.Language):
        spacy_lang = lang
    elif callable(lang):
        # language-identifier function: run it on the text first
        spacy_lang = cache.load_spacy_lang(lang(text))
    else:
        raise TypeError(
            "`lang` must be {}, not {}".format(
                {compat.unicode_, spacy.language.Language, types.FunctionType},
                type(lang),
            )
        )
    return spacy_lang(text)