How to use the textacy.extract.words function in textacy

To help you get started, we've selected a few textacy.extract.words examples based on popular ways the function is used in public projects.

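For orientation before the snippets below: extract.words streams filtered spaCy Token objects out of a Doc, with stopwords and punctuation dropped by default. A minimal sketch, assuming spaCy and a small English pipeline (en_core_web_sm, an assumed model name; any pipeline works) are installed:

import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")  # model name is an assumption
doc = nlp("The quick brown fox jumps over the lazy dog.")

# restrict to nouns; stopwords and punctuation are filtered by default
nouns = list(extract.words(doc, include_pos={"NOUN"}))
print([tok.text for tok in nouns])  # e.g. ['fox', 'dog'] (tagger-dependent)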

github chartbeat-labs / textacy / tests / test_extract.py
def test_pos(self, spacy_doc):
        result1 = list(extract.words(spacy_doc, include_pos={"NOUN"}))
        result2 = list(extract.words(spacy_doc, include_pos="NOUN"))
        assert all(tok.pos_ == "NOUN" for tok in result1)
        assert all(tok.pos_ == "NOUN" for tok in result2)
        result3 = list(extract.words(spacy_doc, exclude_pos={"NOUN"}))
        result4 = list(extract.words(spacy_doc, exclude_pos="NOUN"))
        assert not any(tok.pos_ == "NOUN" for tok in result3)
        assert not any(tok.pos_ == "NOUN" for tok in result4)
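As the assertions show, include_pos and exclude_pos each accept either a single part-of-speech tag as a str or a set of tags. A hedged sketch of the same filters outside the test suite, reusing the nlp pipeline and doc from the sketch above:

verbs = list(extract.words(doc, include_pos="VERB"))             # single tag as a str
content = list(extract.words(doc, exclude_pos={"DET", "ADP"}))   # several tags as a set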
github chartbeat-labs / textacy / tests / test_extract.py
def test_min_freq(self, spacy_doc):
        counts = collections.Counter()
        counts.update(tok.lower_ for tok in spacy_doc)
        result = list(extract.words(spacy_doc, min_freq=2))
        assert all(counts[tok.lower_] >= 2 for tok in result)
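min_freq keeps only tokens whose lowercased form occurs at least that many times in the document, which is exactly what the counts[tok.lower_] assertion checks. A small illustration, using the same assumed pipeline:

doc = nlp("Apples and apples and more apples, plus one orange.")
frequent = list(extract.words(doc, min_freq=2))
# "apples" (three case-folded occurrences) survives; "orange" (one) is dropped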
github chartbeat-labs / textacy / textacy / spacier / doc_extensions.py
{"lemma", "lower", types.FunctionType, None},
            )
        )
    if ngrams:
        unigrams_ = []
        ngrams_ = []
        ng_kwargs = {
            "filter_stops", "filter_punct", "filter_nums",
            "include_pos", "exclude_pos",
            "min_freq",
        }
        ng_kwargs = {key: val for key, val in kwargs.items() if key in ng_kwargs}
        for n in sorted(utils.to_collection(ngrams, int, set)):
            # use a faster function for unigrams
            if n == 1:
                unigrams_ = extract.words(doc, **ng_kwargs)
            else:
                ngrams_.append(extract.ngrams(doc, n, **ng_kwargs))
        ngrams_ = itertoolz.concat(ngrams_)
    if entities is not None:
        ent_kwargs = {"include_types", "exclude_types", "drop_determiners", "min_freq"}
        ent_kwargs = {key: val for key, val in kwargs.items() if key in ent_kwargs}
        entities_ = extract.entities(doc, **ent_kwargs)
    if ngrams:
        # use ngrams as-is
        if entities is None:
            terms = itertoolz.concatv(unigrams_, ngrams_)
        # remove unigrams + ngrams that are duplicates of entities
        else:
            entities_ = tuple(entities_)
            ent_idxs = {(ent.start, ent.end) for ent in entities_}
            unigrams_ = (
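The pattern worth copying here is the kwarg routing: a whitelist of parameter names that extract.words (and extract.ngrams) understand is intersected with the caller's **kwargs before dispatch, so unrelated keyword arguments never reach the extractors. The same pattern in isolation, with hypothetical caller kwargs and continuing the earlier sketch:

allowed = {
    "filter_stops", "filter_punct", "filter_nums",
    "include_pos", "exclude_pos", "min_freq",
}
kwargs = {"include_pos": "NOUN", "min_freq": 2, "as_strings": True}  # hypothetical input
words_kwargs = {key: val for key, val in kwargs.items() if key in allowed}
unigrams = extract.words(doc, **words_kwargs)  # "as_strings" never reaches extract.words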
github chartbeat-labs / textacy / textacy / text_stats.py
def __init__(self, doc):
        self.lang = doc.vocab.lang
        self.n_sents = itertoolz.count(doc.sents) if doc.is_sentenced else None
        # get objs for basic count computations
        hyphenator = cache.load_hyphenator(lang=self.lang)
        words = tuple(
            extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False)
        )
        syllables_per_word = tuple(
            len(hyphenator.positions(word.lower_)) + 1 for word in words
        )
        chars_per_word = tuple(len(word) for word in words)
        # compute basic counts needed for most readability stats
        self.n_words = len(words)
        self.n_unique_words = len({word.lower for word in words})
        self.n_chars = sum(chars_per_word)
        self.n_long_words = sum(1 for cpw in chars_per_word if cpw >= 7)
        self.n_syllables = sum(syllables_per_word)
        self.n_monosyllable_words = sum(1 for spw in syllables_per_word if spw == 1)
        self.n_polysyllable_words = sum(1 for spw in syllables_per_word if spw >= 3)
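These counts are the raw inputs to standard readability statistics. As one example, a hypothetical helper (not part of the snippet above) computing the classic Flesch reading-ease score from the attributes set in __init__:

def flesch_reading_ease(stats):
    # hypothetical helper; `stats` is a TextStats-like object as built above
    words_per_sent = stats.n_words / stats.n_sents
    syllables_per_word = stats.n_syllables / stats.n_words
    return 206.835 - 1.015 * words_per_sent - 84.6 * syllables_per_word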
github chartbeat-labs / textacy / textacy / similarity.py
where larger values correspond to more similar documents.

    References:
        - Ofir Pele and Michael Werman, "A linear time histogram metric for improved
          SIFT matching," in Computer Vision - ECCV 2008, Marseille, France, 2008.
        - Ofir Pele and Michael Werman, "Fast and robust earth mover's distances,"
          in Proc. 2009 IEEE 12th Int. Conf. on Computer Vision, Kyoto, Japan, 2009.
        - Kusner, Matt J., et al. "From word embeddings to document distances."
          Proceedings of the 32nd International Conference on Machine Learning
          (ICML 2015). 2015. http://jmlr.org/proceedings/papers/v37/kusnerb15.pdf
    """
    word_idxs = dict()

    n = 0
    word_vecs = []
    for word in itertoolz.concatv(extract.words(doc1), extract.words(doc2)):
        if word.has_vector and word_idxs.setdefault(word.orth, n) == n:
            word_vecs.append(word.vector)
            n += 1
    distance_mat = pairwise_distances(np.array(word_vecs), metric=metric).astype(
        np.double
    )
    distance_mat /= distance_mat.max()

    vec1 = collections.Counter(
        word_idxs[word.orth] for word in extract.words(doc1) if word.has_vector
    )
    vec1 = np.array(
        [vec1[word_idx] for word_idx in range(len(word_idxs))]
    ).astype(np.double)
    vec1 /= vec1.sum()  # normalize word counts

    vec2 = collections.Counter(
        word_idxs[word.orth] for word in extract.words(doc2) if word.has_vector
    )
    vec2 = np.array(
        [vec2[word_idx] for word_idx in range(len(word_idxs))]
    ).astype(np.double)
    vec2 /= vec2.sum()  # normalize word counts

    return 1.0 - emd(vec1, vec2, distance_mat)
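Judging by the file path, the references, and the emd call, this is textacy's Word Mover's Distance similarity (textacy.similarity.word_movers, an assumption, since the function signature is cut off). It needs word vectors, so a vectorless pipeline would leave word_vecs empty. A hedged usage sketch:

import textacy.similarity

nlp_md = spacy.load("en_core_web_md")  # assumed pipeline with word vectors
doc1 = nlp_md("The cat sat on the mat.")
doc2 = nlp_md("A kitten rested on a rug.")
score = textacy.similarity.word_movers(doc1, doc2)  # 1.0 = identical, 0.0 = maximally distant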
github chartbeat-labs / textacy / textacy / topic_modeling.py
Args:
        spacy_docs (iterable(``spacy.Doc``))
        lemmatize (bool, optional)
        filter_stops (bool, optional)
        filter_punct (bool, optional)
        filter_nums (bool, optional)
        good_pos_tags (set(str) or 'numeric', optional)
        bad_pos_tags (set(str) or 'numeric', optional)

    Yields:
        list(str)
    """
    for spacy_doc in spacy_docs:
        # extract once with the shared filters, then branch only on the
        # token attribute to yield
        words = extract.words(
            spacy_doc,
            filter_stops=filter_stops, filter_punct=filter_punct, filter_nums=filter_nums,
            good_pos_tags=good_pos_tags, bad_pos_tags=bad_pos_tags,
        )
        if lemmatize is True:
            yield [word.lemma_ for word in words]
        else:
            yield [word.orth_ for word in words]
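This generator yields one list of strings per document, ready for bag-of-words vectorization. A condensed, hedged equivalent of the lemmatize=True path, dropping the legacy good_pos_tags/bad_pos_tags arguments (which later textacy releases replaced with include_pos/exclude_pos):

def docs_to_lemma_lists(spacy_docs):
    # hypothetical helper mirroring the lemmatize=True branch above
    for spacy_doc in spacy_docs:
        yield [word.lemma_ for word in extract.words(spacy_doc)]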