How to use the cytoolz.itertoolz module in cytoolz

To help you get started, we’ve selected a few cytoolz examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github chartbeat-labs / textacy / textacy / ke / scake.py View on Github external
# Coerce include_pos (None / str / collection of str) into a set of POS tag strings.
include_pos = utils.to_collection(include_pos, compat.unicode_, set)
    # A float topn is interpreted as a fraction of candidates, so it must be in (0.0, 1.0].
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                "topn={} is invalid; "
                "must be an int, or a float between 0.0 and 1.0".format(topn)
            )

    # bail out on empty docs
    if not doc:
        return []

    # build up a graph of good words, edges weighting by adjacent sentence co-occurrence
    cooc_mat = collections.Counter()
    # NOTE(review): this assumes doc.sents yields a *fresh* generator on each access
    # (true for spaCy Doc.sents) — otherwise count() here would exhaust it. Confirm.
    n_sents = itertoolz.count(doc.sents)  # in case doc only has 1 sentence
    # NOTE(review): if n_sents == 1, sliding_window(1, ...) yields 1-tuples and the
    # 2-way unpack below raises ValueError — confirm against the full source.
    for sent1, sent2 in itertoolz.sliding_window(min(2, n_sents), doc.sents):
        # Keep only "good" words: not stopwords/punct/whitespace, and (if given)
        # with an allowed part-of-speech tag.
        window_words = (
            word
            for word in itertoolz.concatv(sent1, sent2)
            if not (word.is_stop or word.is_punct or word.is_space)
            and (not include_pos or word.pos_ in include_pos)
        )
        window_words = ke_utils.normalize_terms(window_words, normalize)
        # Count each unordered pair of distinct terms co-occurring in this window;
        # sorting makes (a, b) and (b, a) collapse into one key.
        cooc_mat.update(
            w1_w2
            for w1_w2 in itertools.combinations(sorted(window_words), 2)
            if w1_w2[0] != w1_w2[1]
        )
    # doc doesn't have any valid words...
    if not cooc_mat:
        return []
github chartbeat-labs / textacy / textacy / augmentation / transforms.py View on Github external
#     pos = set(aug_tok.pos for aug_tok in aug_toks if aug_tok.is_word)
    # cand_idx_pairs = list(
    #     itertools.chain.from_iterable(
    #         itertools.combinations(
    #             (idx for idx, aug_tok in enumerate(aug_toks) if aug_tok.pos == pos_),
    #             2,
    #         )
    #         for pos_ in pos
    #     )
    # )
    # Candidate indexes: word tokens whose POS is allowed (pos is None => any POS).
    cand_idxs = (
        idx for idx, aug_tok in enumerate(aug_toks)
        if aug_tok.is_word and (pos is None or aug_tok.pos in pos)
    )
    # Keep only *directly adjacent* candidate pairs (idx2 == idx1 + 1), i.e.
    # candidates that sit next to each other in the token sequence.
    cand_idx_pairs = [
        (idx1, idx2) for idx1, idx2 in itertoolz.sliding_window(2, cand_idxs)
        if idx2 - idx1 == 1
    ]
    rand_idx_pairs = _select_random_candidates(cand_idx_pairs, num)
    # Nothing selected to swap => return an unchanged shallow copy.
    if not rand_idx_pairs:
        return aug_toks[:]

    new_aug_toks = aug_toks[:]
    # Swap each selected adjacent pair; each slot keeps its original trailing
    # whitespace (ws) so the surface text stays well-formed.
    for idx1, idx2 in rand_idx_pairs:
        tok1 = new_aug_toks[idx1]
        tok2 = new_aug_toks[idx2]
        # NOTE(review): excerpt is cut off below — the matching assignment for
        # new_aug_toks[idx2] is not visible here.
        new_aug_toks[idx1] = aug_utils.AugTok(
            text=tok2.text,
            ws=tok1.ws,
            pos=tok2.pos,
            is_word=tok2.is_word,
            syns=tok2.syns,
github chartbeat-labs / textacy / textacy / augmentation / transforms.py View on Github external
# bail out on very short sentences to avoid clobbering meaning
    if len(aug_toks) < 3:
        return aug_toks[:]

    # Insertion candidates: tokens that actually have synonyms and an allowed POS
    # (pos is None => any POS).
    cand_aug_toks = [
        aug_tok for aug_tok in aug_toks
        if aug_tok.syns and (pos is None or aug_tok.pos in pos)
    ]
    rand_aug_toks = _select_random_candidates(cand_aug_toks, num)
    # Pick that many *distinct* positions at which the chosen tokens' synonyms
    # will be inserted.
    rand_idxs = random.sample(range(len(aug_toks)), len(rand_aug_toks))
    if not rand_idxs:
        return aug_toks[:]

    rand_aug_toks = iter(rand_aug_toks)
    new_aug_toks = []
    # Walk tokens pairwise; the [None] pad supplies a "previous token" for index 0.
    for idx, (prev_tok, curr_tok) in enumerate(itertoolz.sliding_window(2, [None] + aug_toks)):
        if idx in rand_idxs:
            # NOTE(review): rand_idxs is a list, so this membership test is O(n)
            # per token; a set would be O(1) — confirm against the full source.
            rand_aug_tok = next(rand_aug_toks)
            if prev_tok:
                # use previous token's whitespace for inserted synonym
                new_tok_ws = prev_tok.ws
                if prev_tok.is_word and not prev_tok.ws:
                    # previous token should have whitespace, if a word
                    new_aug_toks[-1] = aug_utils.AugTok(
                        text=prev_tok.text,
                        ws=" ",
                        pos=prev_tok.pos,
                        is_word=True,
                        syns=prev_tok.syns,
                    )
            else:
                new_tok_ws = " "
github chartbeat-labs / textacy / textacy / io / utils.py View on Github external
def unzip(seq):
    """
    Transpose an iterable of tuples into a tuple of lazy "column" iterators.

    Borrowed from ``toolz.sandbox.core.unzip``, but implemented with the standard
    library's ``itertools`` + ``operator`` only, so this helper needs no
    toolz/cytoolz dependency at all.

    Args:
        seq (Iterable[Iterable]): e.g. an iterable of ``(a, b)`` pairs.

    Returns:
        Tuple[Iterator, ...]: one iterator per position in the first element;
        an empty tuple if ``seq`` is empty.
    """
    from operator import itemgetter  # local import: keeps module-level deps unchanged

    seq = iter(seq)
    # check how many iterators we need by peeking at the first element
    try:
        first = tuple(next(seq))
    except StopIteration:
        return tuple()
    niters = len(first)
    # re-attach the consumed first element, then fan out one independent
    # iterator per "column"
    seqs = itertools.tee(itertools.chain([first], seq), niters)
    # itemgetter(i) is bound eagerly per map(), so there is no late-binding issue
    return tuple(map(itemgetter(i), s) for i, s in enumerate(seqs))
github chartbeat-labs / textacy / textacy / io / utils.py View on Github external
def unzip(seq):
    """
    Transpose an iterable of tuples into a tuple of lazy per-position iterators.

    Borrowed from ``toolz.sandbox.core.unzip``, but rewritten on top of the
    standard library's ``itertools`` alone, so this helper carries no
    toolz/cytoolz dependency.

    Args:
        seq (Iterable[Iterable]): e.g. an iterable of ``(a, b)`` pairs.

    Returns:
        Tuple[Iterator, ...]: one iterator per position in the first element;
        an empty tuple if ``seq`` is empty.
    """
    seq = iter(seq)
    # peek at the first element to learn how many "columns" there are
    try:
        first = tuple(next(seq))
    except StopIteration:
        return tuple()
    n_cols = len(first)
    # re-attach the consumed first element, then make one independent copy per column
    copies = itertools.tee(itertools.chain((first,), seq), n_cols)

    def _column(i, rows):
        # helper binds i as a parameter, avoiding the genexp late-binding pitfall
        return (row[i] for row in rows)

    return tuple(_column(i, rows) for i, rows in enumerate(copies))
github chartbeat-labs / textacy / textacy / ke / yake.py View on Github external
normalize (str)
        stop_words (Set[str])
        window_size (int)

    Returns:
        Dict[int, Dict[str, list]]
    """
    # Per-word accumulator: word id -> {"is_uc": [...], "sent_idx": [...],
    # "l_context": [...], "r_context": [...]}, one entry appended per occurrence.
    word_occ_vals = collections.defaultdict(lambda: collections.defaultdict(list))

    def _is_upper_cased(tok):
        # ALL-CAPS, or Title-case anywhere but sentence start (where Title-case
        # is expected and thus uninformative).
        return tok.is_upper or (tok.is_title and not tok.is_sent_start)

    attr_name = _get_attr_name(normalize, False)
    # Pad each sentence on both sides so every real word gets a full-width window;
    # window length 1 + 2*window_size centers each real token exactly once.
    padding = [None] * window_size
    for sent_idx, sent in enumerate(doc.sents):
        sent_padded = itertoolz.concatv(padding, sent, padding)
        for window in itertoolz.sliding_window(1 + (2 * window_size), sent_padded):
            # window = [window_size left words] + [center word] + [window_size right words]
            lwords, word, rwords = window[:window_size], window[window_size], window[window_size + 1:]
            w_id = getattr(word, attr_name)
            # side effect: grows the caller-provided stop_words set in place
            if word.is_stop:
                stop_words.add(w_id)
            word_occ_vals[w_id]["is_uc"].append(_is_upper_cased(word))
            word_occ_vals[w_id]["sent_idx"].append(sent_idx)
            # context terms on each side, skipping padding / punctuation / whitespace
            word_occ_vals[w_id]["l_context"].extend(
                getattr(w, attr_name) for w in lwords
                if not (w is None or w.is_punct or w.is_space)
            )
            word_occ_vals[w_id]["r_context"].extend(
                getattr(w, attr_name) for w in rwords
                if not (w is None or w.is_punct or w.is_space)
            )
    return word_occ_vals