How to use the textacy.utils.to_collection function in textacy

To help you get started, we've selected a few textacy examples based on popular ways it is used in public projects.
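In short, to_collection(val, val_type, col_type) coerces a single value or an iterable of values into a collection of the requested type, type-checking each item against val_type along the way; None passes through unchanged. A few illustrative calls, mirroring the behavior pinned down by the test below:

from textacy import utils

utils.to_collection(1, int, list)        # a bare value is wrapped: [1]
utils.to_collection([1, 2], int, tuple)  # an iterable is cast: (1, 2)
utils.to_collection(None, int, list)     # None short-circuits: None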

From chartbeat-labs/textacy, tests/test_utils.py:
from textacy import utils


def test_to_collection():
    in_outs = [
        [(1, int, list), [1]],
        [([1, 2], int, tuple), (1, 2)],
        [((1, 1.0), (int, float), set), {1, 1.0}],
    ]
    assert utils.to_collection(None, int, list) is None
    for in_, out_ in in_outs:
        assert utils.to_collection(*in_) == out_
From chartbeat-labs/textacy, textacy/lang_utils.py:
def identify_topn_langs(self, text, topn=3):
    """
    Identify the ``topn`` most probable languages identified in ``text``.

    Args:
        text (str)
        topn (int)

    Returns:
        List[Tuple[str, float]]: 2-letter language code and its probability
        for the ``topn`` most probable languages.
    """
    text_ = utils.to_collection(text[:self.max_text_len], str, list)
    if self._is_valid(text_[0]):
        lang_probs = sorted(
            zip(self.pipeline.classes_, self.pipeline.predict_proba(text_).flat),
            key=operator.itemgetter(1),
            reverse=True,
        )[:topn]
        return [(lang.item(), prob.item()) for lang, prob in lang_probs]
    else:
        return [("un", 1.0)]
From chartbeat-labs/textacy, textacy/ke/utils.py:
    Args:
        doc (:class:`spacy.tokens.Doc`)
        ns (int or Tuple[int]): One or more n values for which to generate n-grams.
            For example, ``2`` gets bigrams; ``(2, 3)`` gets bigrams and trigrams.
        include_pos (str or Set[str]): One or more POS tags with which to filter ngrams.
            If None, include tokens of all POS tags.

    Yields:
        Tuple[:class:`spacy.tokens.Token`]: Next ngram candidate,
        as a tuple of constituent Tokens.

    See Also:
        :func:`textacy.extract.ngrams()`
    """
    ns = t_utils.to_collection(ns, int, tuple)
    include_pos = t_utils.to_collection(include_pos, compat.unicode_, set)
    ngrams = itertoolz.concat(itertoolz.sliding_window(n, doc) for n in ns)
    ngrams = (
        ngram
        for ngram in ngrams
        if not (ngram[0].is_stop or ngram[-1].is_stop)
        and not any(word.is_punct or word.is_space for word in ngram)
    )
    if include_pos:
        ngrams = (
            ngram
            for ngram in ngrams
            if all(word.pos_ in include_pos for word in ngram)
        )
    for ngram in ngrams:
        yield ngram
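The two to_collection calls at the top are what let callers pass either a scalar or a collection: ns=2 becomes (2,), while ns=(2, 3) passes through as a tuple, so the sliding-window loop treats both cases identically. (t_utils is this module's alias for textacy.utils, and compat.unicode_ is simply str on Python 3.) For example:

from textacy import utils

for ns in (2, (2, 3)):
    print(ns, "->", utils.to_collection(ns, int, tuple))
# prints: 2 -> (2,)
#         (2, 3) -> (2, 3)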
From chartbeat-labs/textacy, textacy/augmentation/transforms.py:
    Args:
        aug_toks (List[:class:`AugTok`]): Sequence of tokens to augment
            through synonym substitution.
        num (int or float): If int, maximum number of words with available synonyms
            to substitute with a randomly selected synonym; if float, probability
            that a given word with synonyms will be substituted.
        pos (str or Set[str]): Part of speech tag(s) of words to be considered
            for augmentation. If None, all words with synonyms are considered.

    Returns:
        List[:obj:`AugTok`]: New, augmented sequence of tokens.

    Note:
        This transform requires :class:`textacy.resources.ConceptNet` to be downloaded
        to work properly, since this is the data source for word synonyms to be substituted.
    """
    _validate_aug_toks(aug_toks)
    pos = utils.to_collection(pos, str, set)
    cand_idxs = [
        idx for idx, aug_tok in enumerate(aug_toks)
        if aug_tok.syns and (pos is None or aug_tok.pos in pos)
    ]
    rand_idxs = set(_select_random_candidates(cand_idxs, num))
    if not rand_idxs:
        return aug_toks[:]

    new_aug_toks = []
    for idx, aug_tok in enumerate(aug_toks):
        if idx in rand_idxs:
            new_aug_toks.append(
                aug_utils.AugTok(
                    text=random.choice(aug_tok.syns),
                    ws=aug_tok.ws,
                    pos=aug_tok.pos,
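Note the "pos is None or aug_tok.pos in pos" guard: because to_collection(None, ...) returns None rather than an empty set, "no filter given" stays distinguishable from "a filter that matches nothing". A small self-contained sketch of the pattern (filter_by_pos is a hypothetical helper, not part of textacy):

from textacy import utils

def filter_by_pos(tags, pos=None):
    # a bare str like "NOUN" becomes {"NOUN"}; None stays None
    pos = utils.to_collection(pos, str, set)
    return [tag for tag in tags if pos is None or tag in pos]

tags = ["NOUN", "VERB", "ADJ", "NOUN"]
print(filter_by_pos(tags))          # no filter: all four tags pass
print(filter_by_pos(tags, "NOUN"))  # ['NOUN', 'NOUN']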
From chartbeat-labs/textacy, textacy/augmentation/transforms.py:
    """
    Randomly delete words, up to ``num`` times or with a probability of ``num``.

    Args:
        aug_toks (List[:class:`AugTok`]): Sequence of tokens to augment
            through word deletion.
        num (int or float): If int, maximum number of words to delete;
            if float, probability that a given word will be deleted.
        pos (str or Set[str]): Part of speech tag(s) of words to be considered
            for augmentation. If None, all words are considered.

    Returns:
        List[:class:`AugTok`]: New, augmented sequence of tokens.
    """
    _validate_aug_toks(aug_toks)
    pos = utils.to_collection(pos, str, set)
    # bail out on very short sentences to avoid clobbering meaning
    if len(aug_toks) < 3:
        return aug_toks[:]

    cand_idxs = [
        idx for idx, aug_tok in enumerate(aug_toks)
        if aug_tok.is_word and (pos is None or aug_tok.pos in pos)
    ]
    rand_idxs = set(_select_random_candidates(cand_idxs, num))
    if not rand_idxs:
        return aug_toks[:]

    new_aug_toks = []
    padded_triplets = itertoolz.sliding_window(3, [None] + aug_toks + [None])
    for idx, (prev_tok, curr_tok, next_tok) in enumerate(padded_triplets):
        if idx in rand_idxs:
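Casting pos to a set rather than a tuple also makes the per-token membership test O(1). And because to_collection validates item types, malformed arguments fail fast; based on the validation in textacy's implementation (worth confirming against your installed version), a mixed-type input raises TypeError:

from textacy import utils

try:
    # every item must be an instance of the declared val_type (str here)
    utils.to_collection(["NOUN", 42], str, set)
except TypeError as err:
    print("rejected:", err)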
From chartbeat-labs/textacy, textacy/ke/scake.py:
        topn (int or float): Number of top-ranked terms to return as key terms.
            If an integer, represents the absolute number; if a float, value
            must be in the interval (0.0, 1.0], which is converted to an int by
            ``int(round(len(candidates) * topn))``

    Returns:
        List[Tuple[str, float]]: Sorted list of top ``topn`` key terms and
        their corresponding scores.

    References:
        Duari, Swagata & Bhatnagar, Vasudha. (2018). sCAKE: Semantic Connectivity
        Aware Keyword Extraction. Information Sciences. 477.
        https://arxiv.org/abs/1811.10831v1
    """
    # validate / transform args
    include_pos = utils.to_collection(include_pos, compat.unicode_, set)
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                "topn={} is invalid; "
                "must be an int, or a float between 0.0 and 1.0".format(topn)
            )

    # bail out on empty docs
    if not doc:
        return []

    # build up a graph of good words, with edges weighted by adjacent-sentence co-occurrence
    cooc_mat = collections.Counter()
    n_sents = itertoolz.count(doc.sents)  # in case doc only has 1 sentence
    for sent1, sent2 in itertoolz.sliding_window(min(2, n_sents), doc.sents):
        window_words = (
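scake() opens with the same "validate / transform args" block that recurs across textacy's keyterm extractors: normalize the one-or-many include_pos argument first, then range-check topn before doing any real work. A hedged sketch of that pattern (extract_terms and its defaults are illustrative, not textacy's API):

from textacy import utils

def extract_terms(words, include_pos="NOUN", topn=10):
    # normalize "one or many" POS tags up front, as scake() does
    include_pos = utils.to_collection(include_pos, str, set)
    if isinstance(topn, float) and not 0.0 < topn <= 1.0:
        raise ValueError(
            "topn={} is invalid; "
            "must be an int, or a float between 0.0 and 1.0".format(topn)
        )
    return include_pos, topn

print(extract_terms(["cat", "dog"], include_pos=("NOUN", "PROPN")))
# ({'NOUN', 'PROPN'}, 10) (set order may vary)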
From chartbeat-labs/textacy, textacy/ke/sgrank.py:
    Returns:
        List[Tuple[str, float]]: Sorted list of top ``topn`` key terms and
        their corresponding SGRank scores

    Raises:
        ValueError: if ``topn`` is a float but not in (0.0, 1.0] or
            ``window_size`` < 2

    References:
        Danesh, Sumner, and Martin. "SGRank: Combining Statistical and Graphical
        Methods to Improve the State of the Art in Unsupervised Keyphrase Extraction."
        Lexical and Computational Semantics (*SEM 2015) (2015): 117.
    """
    # validate / transform args
    ngrams = utils.to_collection(ngrams, int, tuple)
    include_pos = utils.to_collection(include_pos, compat.unicode_, set)
    if window_size < 2:
        raise ValueError("`window_size` must be >= 2")
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                "`topn` must be an int, or a float between 0.0 and 1.0"
            )

    n_toks = len(doc)
    window_size = min(n_toks, window_size)
    # bail out on (nearly) empty docs
    if n_toks < 2:
        return []

    candidates, candidate_counts = _get_candidates(doc, normalize, ngrams, include_pos)
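When topn is a float in (0.0, 1.0], it is treated as a fraction of the candidate pool; the scake() docstring above spells out the conversion, which looks like this in isolation:

candidates = ["term{}".format(i) for i in range(40)]
topn = 0.25
if isinstance(topn, float):
    # a fractional topn becomes an absolute count of candidates
    topn = int(round(len(candidates) * topn))
print(topn)  # 10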
From chartbeat-labs/textacy, textacy/datasets/imdb.py:
                are included. Both low and high values must be specified, but a null value
                for either is automatically replaced by the minimum or maximum
                valid values, respectively.
            min_len (int): Filter movie reviews by the length (number of characters)
                of their text content.
            limit (int): Yield no more than ``limit`` movie reviews that match all
                specified filters.

        Yields:
            str: Text of the next movie review in dataset passing all filters.
            dict: Metadata of the next movie review in dataset passing all filters.

        Raises:
            ValueError: If any filtering options are invalid.
        """
        self._subset = utils.to_collection(subset, (str, bytes), tuple)
        self._label = utils.to_collection(label, (str, bytes), tuple)
        try:
            filters = self._get_filters(rating_range, min_len)
            for record in itertools.islice(self._filtered_iter(filters), limit):
                yield record.pop("text"), record
        finally:
            self._subset = None
            self._label = None
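Here val_type is the tuple (str, bytes), which works like isinstance()'s tuple-of-types form: each item need only match one of the listed types, just as the (int, float) case in the test at the top does. (Also note the try/finally, which resets the filters even if iteration stops early.) For instance:

from textacy import utils

print(utils.to_collection("train", (str, bytes), tuple))
# ('train',)
print(utils.to_collection([b"pos", "neg"], (str, bytes), tuple))
# (b'pos', 'neg')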