How to use textacy - 10 common examples

To help you get started, we’ve selected a few textacy examples based on popular ways it is used in public projects.

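Most of the snippets below start from a spaCy Doc created by textacy. As a quick orientation, here is a minimal sketch of that first step, following the same 0.x-era API the examples use (the sample text is invented for illustration):

import textacy

# make_spacy_doc runs raw text through a spaCy pipeline; "en" names an
# installed spaCy English model, as in the test fixtures below
text = "Mr. Speaker, I rise today to talk about the state of our cities."
doc = textacy.make_spacy_doc(text, lang="en")
print(len(doc), "tokens")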

github chartbeat-labs / textacy / textacy / keyterms.py View on GitHub
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError(
                "`n_keyterms` must be an int, or a float between 0.0 and 1.0"
            )
        n_keyterms = int(round(n_toks * n_keyterms))
    if window_width < 2:
        raise ValueError("`window_width` must be >= 2")
    window_width = min(n_toks, window_width)
    min_term_freq = min(n_toks // 1000, 4)
    if isinstance(ngrams, int):
        ngrams = (ngrams,)

    # build full list of candidate terms
    # if inverse doc freqs available, include nouns, adjectives, and verbs;
    # otherwise, just include nouns and adjectives
    # (without IDF downweighting, verbs dominate the results in a bad way)
    include_pos = {"NOUN", "PROPN", "ADJ", "VERB"} if idf else {"NOUN", "PROPN", "ADJ"}
    terms = itertoolz.concat(
        extract.ngrams(
            doc,
            n,
            filter_stops=True,
            filter_punct=True,
            filter_nums=False,
            include_pos=include_pos,
            min_freq=min_term_freq,
        )
        for n in ngrams
    )

    # get normalized term strings, as desired
    # paired with positional index in document and length in a 3-tuple
    if normalize == "lemma":
        terms = [(term.lemma_, term.start, len(term)) for term in terms]
    elif normalize == "lower":
        terms = [(term.lower_, term.start, len(term)) for term in terms]
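This fragment builds the candidate-term list inside textacy's keyterms module; its parameters (window_width, ngrams, idf) match the SGRank algorithm, which the test file further down (tests/ke/test_sgrank.py) exercises. In practice you would call the public function rather than reimplementing this step. A hedged sketch, assuming the 0.x-era textacy.keyterms.sgrank entry point (newer releases moved it to textacy.ke.sgrank):

import textacy
import textacy.keyterms

doc = textacy.make_spacy_doc(
    "Graph-based ranking scores candidate noun and adjective phrases "
    "to surface the key terms of a document.",
    lang="en",
)
# returns (term, score) pairs, highest-scored first
print(textacy.keyterms.sgrank(doc, ngrams=(1, 2), n_keyterms=5))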
github chartbeat-labs / textacy / tests / datasets / test_imdb.py View on GitHub
def test_records():
    for text, meta in DATASET.records(limit=3):
        assert isinstance(text, compat.unicode_)
        assert isinstance(meta, dict)
github chartbeat-labs / textacy / tests / datasets / test_capitol_words.py View on GitHub
def test_texts():
    for text in DATASET.texts(limit=3):
        assert isinstance(text, compat.unicode_)
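Both dataset tests above use the same streaming interface: records() yields (text, metadata) pairs and texts() yields plain strings, each accepting a limit. A sketch using the CapitolWords dataset (assumes the data can be downloaded to textacy's default data directory):

import textacy.datasets

ds = textacy.datasets.CapitolWords()
ds.download()  # fetches the data on first use

for text, meta in ds.records(limit=3):
    print(meta.get("speaker_name"), "->", text[:60])

for text in ds.texts(limit=3):
    print(text[:60])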
github chartbeat-labs / textacy / tests / test_readme.py View on GitHub
def test_plaintext_functionality(text):
    preprocessed_text = preprocessing.normalize_whitespace(text)
    preprocessed_text = preprocessing.remove_punctuation(preprocessed_text)
    preprocessed_text = preprocessed_text.lower()
    assert all(char.islower() for char in preprocessed_text if char.isalpha())
    assert all(char.isalnum() or char.isspace() for char in preprocessed_text)
    keyword = "America"
    kwics = text_utils.keyword_in_context(
        text, keyword, window_width=35, print_only=False
    )
    for pre, kw, post in kwics:
        assert kw == keyword
        assert isinstance(pre, compat.unicode_)
        assert isinstance(post, compat.unicode_)
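The preprocessing helpers and keyword_in_context can be chained the same way outside a test; a small sketch with invented input:

from textacy import preprocessing, text_utils

text = "America has many cities. Americans live in America's cities."
clean = preprocessing.normalize_whitespace(text)
clean = preprocessing.remove_punctuation(clean).lower()

# print_only=False makes keyword_in_context return (pre, keyword, post)
# windows instead of printing them
for pre, kw, post in text_utils.keyword_in_context(
    text, "America", window_width=35, print_only=False
):
    print("{} [{}] {}".format(pre, kw, post))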
github chartbeat-labs / textacy / tests / spacier / test_doc_extensions.py View on GitHub
def test_lang(self, doc):
        lang = doc._.lang
        assert isinstance(lang, compat.unicode_)
        assert lang == doc.vocab.lang
github chartbeat-labs / textacy / tests / test_io.py View on GitHub
def test_read_write_sparse_csr_compressed(self, tmpdir):
        expected = sp.csr_matrix(
            (
                np.array([1, 2, 3, 4, 5, 6]),
                (np.array([0, 0, 1, 2, 2, 2]), np.array([0, 2, 2, 0, 1, 2])),
            ),
            shape=(3, 3),
        )
        filepath = str(tmpdir.join("test_read_write_sparse_matrix_csr_compressed.npz"))
        io.write_sparse_matrix(expected, filepath, compressed=True)
        observed = io.read_sparse_matrix(filepath, kind="csr")
        assert abs(observed - expected).nnz == 0
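Outside the test harness, the round trip is just the two calls exercised above; a sketch assuming scipy and a writable path:

import scipy.sparse as sp
import textacy.io as io

mat = sp.csr_matrix(sp.eye(3))
# compressed=True writes a compressed .npz archive
io.write_sparse_matrix(mat, "/tmp/mat.npz", compressed=True)
roundtrip = io.read_sparse_matrix("/tmp/mat.npz", kind="csr")
assert abs(roundtrip - mat).nnz == 0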
github chartbeat-labs / textacy / tests / datasets / test_dataset_utils.py View on GitHub
def test_unpack_archive(tmpdir):
    data = "Here's some text data to pack and unpack."
    fpath_txt = str(tmpdir.join("test_unpack_archive.txt"))
    with tio.open_sesame(fpath_txt, mode="wt") as f:
        f.write(data)
    fpath_zip = str(tmpdir.join("test_unpack_archive.zip"))
    with zipfile.ZipFile(fpath_zip, "w") as f:
        f.write(fpath_txt)
    unpack_archive(fpath_zip, extract_dir=tmpdir)
    fpath_tar = str(tmpdir.join("test_unpack_archive.tar"))
    with tarfile.TarFile(fpath_tar, "w") as f:
        f.add(fpath_txt)
    unpack_archive(fpath_tar, extract_dir=tmpdir)
    unpack_archive(fpath_txt, extract_dir=tmpdir)
github chartbeat-labs / textacy / tests / test_io.py View on GitHub
def test_read_write_bytes_lines(self, tmpdir, spacy_doc):
        expected = [{"idx": i, "sent": sent.text} for i, sent in enumerate(spacy_doc.sents)]
        for ext in (".json", ".json.gz", ".json.bz2", ".json.xz"):
            filepath = str(tmpdir.join("test_read_write_json_lines_bytes" + ext))
        if compat.PY2:
                if ext == ".json.xz":
                    with pytest.raises(ValueError):
                        io.open_sesame(
                            filepath, mode="wb", encoding="utf-8", make_dirs=True
                        )
                else:
                    io.write_json(expected, filepath, mode="wb", make_dirs=True, lines=True)
                    observed = list(io.read_json(filepath, mode="rb", lines=True))
                    assert observed == expected
            else:
                with pytest.raises(TypeError):
                    io.write_json(
                        expected,
                        filepath,
                        mode="wb",
                        encoding=None,
                        make_dirs=True,
                        lines=True,
                    )
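Stripped of the Python 2/3 branching, the happy path for line-delimited JSON is short; a sketch (the file path is arbitrary):

import textacy.io as io

records = [{"idx": 0, "sent": "First sentence."}, {"idx": 1, "sent": "Second one."}]

# lines=True writes one JSON object per line; make_dirs creates parent folders
io.write_json(records, "/tmp/textacy_demo/sents.json", mode="wt", make_dirs=True, lines=True)
observed = list(io.read_json("/tmp/textacy_demo/sents.json", mode="rt", lines=True))
assert observed == records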
github ebursztein / sitefab / tests / test_nlp.py View on GitHub
def test_stats():
    text = "the quick fox and the cat. The turtle and the rabbit."
    doc = make_spacy_doc(text, lang=SPACY_MODEL)
    stats = nlp.compute_stats(doc)
    assert stats.counts.sentences == 2
    assert stats.counts.words == 11
github chartbeat-labs / textacy / tests / ke / test_sgrank.py View on GitHub
def empty_spacy_doc():
    return textacy.make_spacy_doc("", lang="en")