How to use the textacy.extract module in textacy

To help you get started, we've selected a few textacy.extract examples based on popular ways it is used in public projects.

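If you just want to see textacy.extract in action before reading the excerpts below, here is a minimal sketch of the common entry points. It assumes textacy is installed along with the en_core_web_sm spaCy model, and exact signatures vary slightly across textacy versions.

import textacy
from textacy import extract

# build a spaCy doc; assumes the en_core_web_sm model is installed
text = "The International Monetary Fund (I.M.F.) lends money to member countries."
doc = textacy.make_spacy_doc(text, lang="en_core_web_sm")

# pull out filtered words, bigrams, and named entities
words = list(extract.words(doc, filter_stops=True, filter_punct=True))
bigrams = list(extract.ngrams(doc, 2, filter_stops=True))
entities = list(extract.entities(doc))
print(words, bigrams, entities, sep="\n")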

chartbeat-labs/textacy: textacy/keyterms.py (view on GitHub)
    n_toks = len(doc)
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError(
                "`n_keyterms` must be an int, or a float between 0.0 and 1.0"
            )
        n_keyterms = int(round(n_toks * n_keyterms))
    if window_width < 2:
        raise ValueError("`window_width` must be >= 2")
    window_width = min(n_toks, window_width)
    min_term_freq = min(n_toks // 1000, 4)
    if isinstance(ngrams, int):
        ngrams = (ngrams,)

    # build full list of candidate terms
    # if inverse doc freqs available, include nouns, adjectives, and verbs;
    # otherwise, just include nouns and adjectives
    # (without IDF downweighting, verbs dominate the results in a bad way)
    include_pos = {"NOUN", "PROPN", "ADJ", "VERB"} if idf else {"NOUN", "PROPN", "ADJ"}
    terms = itertoolz.concat(
        extract.ngrams(
            doc,
            n,
            filter_stops=True,
            filter_punct=True,
            filter_nums=False,
            include_pos=include_pos,
            min_freq=min_term_freq,
        )
        for n in ngrams
    )

    # get normalized term strings, as desired
    # paired with positional index in document and length in a 3-tuple
    if normalize == "lemma":
        terms = [(term.lemma_, term.start, len(term)) for term in terms]
    elif normalize == "lower":
        terms = [(term.lower_, term.start, len(term)) for term in terms]
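The keyterms code above builds its candidate list by concatenating extract.ngrams() calls over several n values. Here is a standalone sketch of that candidate-building step; the example text and the en_core_web_sm model are assumptions, and the POS filter mirrors the no-IDF branch above.

from cytoolz import itertoolz  # textacy's own dependency
import textacy
from textacy import extract

doc = textacy.make_spacy_doc(
    "Natural language processing lets computers analyze natural language text.",
    lang="en_core_web_sm",
)

include_pos = {"NOUN", "PROPN", "ADJ"}  # no IDF weighting, so verbs are excluded
terms = itertoolz.concat(
    extract.ngrams(doc, n, filter_stops=True, filter_punct=True,
                   filter_nums=False, include_pos=include_pos, min_freq=1)
    for n in (1, 2)
)
# normalize each candidate to a (lemma, start position, length) triple, as above
print([(t.lemma_, t.start, len(t)) for t in terms])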
chartbeat-labs/textacy: tests/test_extract.py (view on GitHub)
def test_default(self, spacy_doc):
        # TODO: figure out if this function no longer works, ugh
        # expected = {"I.M.F.": "International Monetary Fund"}
        expected = {"I.M.F.": ""}
        observed = extract.acronyms_and_definitions(spacy_doc)
        assert observed == expected
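As the test shows, acronyms_and_definitions() returns a dict mapping each detected acronym to its definition, with an empty string when no definition can be linked. A small usage sketch; the example text and model are assumptions.

import textacy
from textacy import extract

doc = textacy.make_spacy_doc(
    "The International Monetary Fund (IMF) was founded in 1944.",
    lang="en_core_web_sm",
)
# dict of acronym -> definition ("" when no definition could be linked)
print(extract.acronyms_and_definitions(doc))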
chartbeat-labs/textacy: tests/test_extract.py (view on GitHub)
def test_determiner(self, spacy_doc):
        result = list(extract.noun_chunks(spacy_doc, drop_determiners=False))
        assert all(isinstance(span, Span) for span in result)
        assert any(span[0].pos_ == "DET" for span in result)
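noun_chunks() yields spacy.tokens.Span objects, and with drop_determiners=False each chunk keeps its leading determiner, which is exactly what this test asserts. A short sketch under the same assumptions (en_core_web_sm, made-up sentence):

import textacy
from textacy import extract

doc = textacy.make_spacy_doc(
    "The quick brown fox jumped over the lazy dog.",
    lang="en_core_web_sm",
)
for chunk in extract.noun_chunks(doc, drop_determiners=False):
    print(chunk.text, chunk[0].pos_)  # first token stays a DET here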
chartbeat-labs/textacy: tests/test_extract.py (view on GitHub)
def test_patdict_bool_int(self, spacy_doc):
        matches = list(extract.matches(spacy_doc, [{"IS_DIGIT": True}]))[:5]
        assert matches
        assert all(span[0].is_digit is True for span in matches)
        matches = list(extract.matches(spacy_doc, [{"LENGTH": 5}]))[:5]
        assert matches
        assert all(len(span[0]) == 5 for span in matches)
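Each pattern passed to extract.matches() is a list of per-token attribute dicts, the same format spacy.matcher.Matcher uses; note that later textacy versions rename this function token_matches(). A sketch matching digit tokens, where the sentence and model are assumptions:

import textacy
from textacy import extract

doc = textacy.make_spacy_doc(
    "In 2019 she ran 3 marathons in 12 weeks.",
    lang="en_core_web_sm",
)
# one-token pattern: any token whose is_digit flag is True
for span in extract.matches(doc, [{"IS_DIGIT": True}]):
    print(span.text)  # 2019, 3, 12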
chartbeat-labs/textacy: tests/test_extract.py (view on GitHub)
def test_default(self, spacy_doc):
        result = list(extract.words(spacy_doc))
        assert all(isinstance(tok, Token) for tok in result)
        assert not any(tok.is_space for tok in result)
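words() yields individual Tokens and always skips whitespace; stop words, punctuation, and number-like tokens are filtered via keyword flags. A sketch with all three filters enabled (sentence and model are assumptions):

import textacy
from textacy import extract

doc = textacy.make_spacy_doc(
    "The 3 quick foxes, and the lazy dog.",
    lang="en_core_web_sm",
)
toks = extract.words(doc, filter_stops=True, filter_punct=True, filter_nums=True)
print([tok.text for tok in toks])  # content words only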
chartbeat-labs/textacy: tests/test_extract.py (view on GitHub)
        patstr_to_pats = [
            (
                "IS_DIGIT:bool(True):? POS:NOUN:*",
                [{"IS_DIGIT": True, "OP": "?"}, {"POS": "NOUN", "OP": "*"}],
            ),
            (
                "LENGTH:int(5) DEP:nsubj:!",
                [{"LENGTH": 5}, {"DEP": "nsubj", "OP": "!"}],
            ),
            ("POS:DET :", [{"POS": "DET"}, {}]),
            (
                "IS_PUNCT:bool(False) : IS_PUNCT:bool(True)",
                [{"IS_PUNCT": False}, {}, {"IS_PUNCT": True}],
            ),
        ]
        for patstr, pat in patstr_to_pats:
            assert extract._make_pattern_from_string(patstr) == pat
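_make_pattern_from_string() is the private helper that expands textacy's compact ATTR:value:OP pattern strings into the Matcher dicts shown above; extract.matches() accepts the string form directly, so you rarely call the helper yourself. A sketch via the public API, with the sentence and model as assumptions:

import textacy
from textacy import extract

doc = textacy.make_spacy_doc(
    "The happy dog chased three red balls.",
    lang="en_core_web_sm",
)
# "POS:ADJ:+ POS:NOUN" expands to [{"POS": "ADJ", "OP": "+"}, {"POS": "NOUN"}]
for span in extract.matches(doc, "POS:ADJ:+ POS:NOUN"):
    print(span.text)  # e.g. "happy dog", "red balls"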
chartbeat-labs/textacy: textacy/topic_modeling.py (view on GitHub)
        merge_nes (bool, optional): if True, merge named entities into single tokens
        merge_ncs (bool, optional): if True, merge noun chunks into single tokens

    Yields:
        ``spacy.Doc``: doc processed from next text in ``texts``
    """
    spacy_nlp = data.load_spacy_pipeline(
        lang=lang, entity=merge_nes, parser=merge_ncs)
    for spacy_doc in spacy_nlp.pipe(texts, tag=True, parse=merge_ncs, entity=merge_nes,
                                    n_threads=2, batch_size=1000):
        if merge_nes is True:
            spacy_utils.merge_spans(
                extract.named_entities(
                    spacy_doc, bad_ne_types='numeric', drop_determiners=False))
        if merge_ncs is True:
            spacy_utils.merge_spans(
                extract.noun_chunks(
                    spacy_doc, drop_determiners=False))
        yield spacy_doc
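Note that this snippet targets textacy's old spaCy 1.x integration: data.load_spacy_pipeline() and the tag/parse/entity/n_threads arguments to pipe() no longer exist. In current spaCy the equivalent span-merging step uses doc.retokenize(); a rough sketch, with the model name as an assumption:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Angela Merkel visited the European Central Bank.")

# merge each named-entity span into a single token, as merge_spans() once did
with doc.retokenize() as retokenizer:
    for ent in doc.ents:
        retokenizer.merge(ent)
print([tok.text for tok in doc])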
eea/eea.corpus: corpus/load_eea_corpus.py (view on GitHub)
def extras(corpus):
    print('Corpus: ', corpus)

    # find published docs
    for doc in corpus.get(published_match_func, limit=3):
        triples = textacy.extract.subject_verb_object_triples(doc)
        print('Published doc: ', doc, list(triples))

    # find doc with specific url
    url = 'http://www.eea.europa.eu/publications/C23I92-826-5409-5'
    for doc in corpus.get(url_match_func(url), limit=3):
        print('specific url:', doc)

    # get terms list
    for doc in corpus.get(url_match_func(url), limit=3):
        tlist = doc.to_terms_list(
            ngrams=1, named_entities=True, as_strings=True
        )
        terms = list(tlist)
        print(terms)
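subject_verb_object_triples() walks the dependency parse and yields one (subject, verb, object) triple per clause it can resolve. A standalone sketch, independent of the EEA corpus above; the sentences and model are assumptions.

import textacy
from textacy import extract

doc = textacy.make_spacy_doc(
    "The fund approved the loan. Analysts praised the decision.",
    lang="en_core_web_sm",
)
for subj, verb, obj in extract.subject_verb_object_triples(doc):
    print(subj, "|", verb, "|", obj)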
chartbeat-labs/textacy: textacy/spacier/doc_extensions.py (view on GitHub)
"filter_stops", "filter_punct", "filter_nums",
            "include_pos", "exclude_pos",
            "min_freq",
        }
        ng_kwargs = {key: val for key, val in kwargs.items() if key in ng_kwargs}
        for n in sorted(utils.to_collection(ngrams, int, set)):
            # use a faster function for unigrams
            if n == 1:
                unigrams_ = extract.words(doc, **ng_kwargs)
            else:
                ngrams_.append(extract.ngrams(doc, n, **ng_kwargs))
        ngrams_ = itertoolz.concat(ngrams_)
    if entities is not None:
        ent_kwargs = {"include_types", "exclude_types", "drop_determiners", "min_freq"}
        ent_kwargs = {key: val for key, val in kwargs.items() if key in ent_kwargs}
        entities_ = extract.entities(doc, **ent_kwargs)
    if ngrams:
        # use ngrams as-is
        if entities is None:
            terms = itertoolz.concatv(unigrams_, ngrams_)
        # remove unigrams + ngrams that are duplicates of entities
        else:
            entities_ = tuple(entities_)
            ent_idxs = {(ent.start, ent.end) for ent in entities_}
            unigrams_ = (
                ug
                for ug in unigrams_
                if (ug.i, ug.i + 1) not in ent_idxs
            )
            ngrams_ = (
                ng
                for ng in ngrams_
                if (ng.start, ng.end) not in ent_idxs
            )
            terms = itertoolz.concatv(unigrams_, ngrams_, entities_)
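This code implements the to_terms_list() doc extension, which combines unigram, n-gram, and entity extraction and drops unigrams and n-grams that duplicate an entity span. A hedged usage sketch: the doc._.to_terms_list attribute is registered on import in textacy versions around 0.8-0.10, and the text and model are assumptions.

import textacy

doc = textacy.make_spacy_doc(
    "The European Central Bank raised interest rates. The bank cited inflation.",
    lang="en_core_web_sm",
)
# unigrams + bigrams + entities, de-duplicated and normalized to strings
terms = list(doc._.to_terms_list(ngrams=(1, 2), entities=True, as_strings=True))
print(terms)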
chartbeat-labs/textacy: textacy/ke/utils.py (view on GitHub)
    """
    Get candidate keyterms from ``doc``, where candidates are token sequences
    that match any pattern in ``patterns``

    Args:
        doc (:class:`spacy.tokens.Doc`)
        patterns (str or List[str] or List[dict] or List[List[dict]]):
            One or multiple patterns to match against ``doc``
            using a :class:`spacy.matcher.Matcher`.

    Yields:
        Tuple[:class:`spacy.tokens.Token`]: Next pattern-matching candidate,
        as a tuple of constituent Tokens.

    See Also:
        :func:`textacy.extract.matches()`
    """
    for match in extract.matches(doc, patterns, on_match=None):
        yield tuple(match)
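These pattern-matched candidates feed textacy's keyterm-extraction algorithms in textacy.ke, which rank candidate terms and return the top-scoring ones. A sketch of that downstream usage, where the text, model, and topn value are assumptions:

import textacy
import textacy.ke

doc = textacy.make_spacy_doc(
    "Semantic networks support keyterm extraction from natural language documents.",
    lang="en_core_web_sm",
)
# textrank ranks candidate terms and returns (term, score) pairs
print(textacy.ke.textrank(doc, normalize="lemma", topn=5))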