How to use the cytoolz.itertoolz.sliding_window function in cytoolz

To help you get started, we've selected a few cytoolz examples based on popular ways it is used in public projects.
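
Before the project snippets, a minimal sketch of the function itself: sliding_window(n, seq) yields overlapping tuples of n consecutive items from seq.

from cytoolz.itertoolz import sliding_window

print(list(sliding_window(2, [1, 2, 3, 4])))
# [(1, 2), (2, 3), (3, 4)]
print(list(sliding_window(3, "abcde")))
# [('a', 'b', 'c'), ('b', 'c', 'd'), ('c', 'd', 'e')]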

github vlukiyanov / pt-sdae / ptsdae / sdae.py View on Github
from collections import OrderedDict
from typing import Iterable, List, Optional

import torch
import torch.nn as nn
from cytoolz.itertoolz import sliding_window


def build_units(
    dimensions: Iterable[int], activation: Optional[torch.nn.Module]
) -> List[torch.nn.Module]:
    """
    Given a list of dimensions and optional activation, return a list of units
    where each unit is a linear layer followed by an activation layer.

    :param dimensions: iterable of dimensions for the chain
    :param activation: activation layer to use e.g. nn.ReLU, set to None to disable
    :return: list of instances of Sequential
    """

    def single_unit(in_dimension: int, out_dimension: int) -> torch.nn.Module:
        unit = [("linear", nn.Linear(in_dimension, out_dimension))]
        if activation is not None:
            unit.append(("activation", activation))
        return nn.Sequential(OrderedDict(unit))

    return [
        single_unit(embedding_dimension, hidden_dimension)
        for embedding_dimension, hidden_dimension in sliding_window(2, dimensions)
    ]
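
The pairwise chaining above is the classic sliding_window idiom: each adjacent pair of layer sizes becomes one linear unit. A minimal sketch with a made-up dimensions list:

from cytoolz.itertoolz import sliding_window

dimensions = [784, 500, 2000, 10]  # hypothetical encoder layer sizes
print(list(sliding_window(2, dimensions)))
# [(784, 500), (500, 2000), (2000, 10)]
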
github chartbeat-labs / textacy / textacy / network.py View on Github
        return nx.Graph()

    # if len(terms) < window_width, cytoolz throws a StopIteration error
    # which we don't want
    if len(terms) < window_width:
        LOGGER.info(
            "`terms` has fewer items (%s) than the specified `window_width` (%s); "
            "setting window width to %s",
            len(terms),
            window_width,
            len(terms),
        )
        window_width = len(terms)

    if isinstance(terms[0], compat.unicode_):
        windows = itertoolz.sliding_window(window_width, terms)
    elif isinstance(terms[0], Token):
        if normalize == "lemma":
            windows = (
                (tok.lemma_ for tok in window)
                for window in itertoolz.sliding_window(window_width, terms)
            )
        elif normalize == "lower":
            windows = (
                (tok.lower_ for tok in window)
                for window in itertoolz.sliding_window(window_width, terms)
            )
        elif not normalize:
            windows = (
                (tok.text for tok in window)
                for window in itertoolz.sliding_window(window_width, terms)
            )
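
Stripped of the spaCy token handling, the windowing step above reduces to this sketch (the term list is invented):

from cytoolz import itertoolz

terms = ["natural", "language", "processing", "in", "python"]
for window in itertoolz.sliding_window(3, terms):
    print(window)
# ('natural', 'language', 'processing')
# ('language', 'processing', 'in')
# ('processing', 'in', 'python')
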
github chartbeat-labs / textacy / textacy / ke / utils.py View on Github
def get_ngram_candidates(doc, ns, include_pos):
    """
    Get candidate n-gram keyterms from ``doc``.

    Args:
        doc (:class:`spacy.tokens.Doc`)
        ns (int or Tuple[int]): One or more n values for which to generate n-grams.
            For example, ``2`` gets bigrams; ``(2, 3)`` gets bigrams and trigrams.
        include_pos (str or Set[str]): One or more POS tags with which to filter ngrams.
            If None, include tokens of all POS tags.

    Yields:
        Tuple[:class:`spacy.tokens.Token`]: Next ngram candidate,
        as a tuple of constituent Tokens.

    See Also:
        :func:`textacy.extract.ngrams()`
    """
    ns = t_utils.to_collection(ns, int, tuple)
    include_pos = t_utils.to_collection(include_pos, compat.unicode_, set)
    ngrams = itertoolz.concat(itertoolz.sliding_window(n, doc) for n in ns)
    ngrams = (
        ngram
        for ngram in ngrams
        if not (ngram[0].is_stop or ngram[-1].is_stop)
        and not any(word.is_punct or word.is_space for word in ngram)
    )
    if include_pos:
        ngrams = (
            ngram
            for ngram in ngrams
            if all(word.pos_ in include_pos for word in ngram)
        )
    yield from ngrams
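
Chaining concat over several window lengths yields all requested n-gram sizes in one stream; a self-contained sketch with made-up tokens:

from cytoolz import itertoolz

tokens = ["machine", "learning", "in", "practice"]
ngrams = itertoolz.concat(itertoolz.sliding_window(n, tokens) for n in (2, 3))
print(list(ngrams))
# [('machine', 'learning'), ('learning', 'in'), ('in', 'practice'),
#  ('machine', 'learning', 'in'), ('learning', 'in', 'practice')]
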
github chartbeat-labs / textacy / textacy / keyterms.py View on Github
            words = (
                word for word in words
                if word.pos_ in include_pos
            )
        for word in words:
            wid = word.lower
            if wid in stop_words or wid in seen_candidates:
                continue
            else:
                seen_candidates.add(wid)
            # NOTE: here i've modified the YAKE algorithm to put less emphasis on term freq
            # term_scores[word.lower_] = word_scores[wid] / (word_freqs[wid] * (1 + word_scores[wid]))
            term_scores[word.lower_] = word_scores[wid] / (math.log2(1 + word_freqs[wid]) * (1 + word_scores[wid]))

    # now compute combined scores for (valid) bigram and trigram candidates
    ngrams = itertoolz.concatv(*(itertoolz.sliding_window(n, doc) for n in ngrams if n > 1))
    ngrams = [
        ngram
        for ngram in ngrams
        if not (ngram[0].is_stop or ngram[-1].is_stop)
        and not any(w.is_punct or w.is_space for w in ngram)
    ]
    if include_pos:
        ngrams = [
            ngram
            for ngram in ngrams
            if all(w.pos_ in include_pos for w in ngram)
        ]
    ngram_freqs = itertoolz.frequencies(
        " ".join(word.lower_ for word in ngram)
        for ngram in ngrams)
    for ngram in ngrams:
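
The frequencies call above counts occurrences of each joined n-gram string; a minimal standalone sketch (tokens invented):

from cytoolz import itertoolz

tokens = ["the", "cat", "saw", "the", "cat"]
bigrams = (" ".join(bg) for bg in itertoolz.sliding_window(2, tokens))
print(itertoolz.frequencies(bigrams))
# {'the cat': 2, 'cat saw': 1, 'saw the': 1}
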
github chartbeat-labs / textacy / textacy / ke / yake.py View on Github
def _get_per_word_occurrence_values(doc, normalize, stop_words, window_size):
    """
    Get base values for each individual occurrence of a word.

    Args:
        doc (:class:`spacy.tokens.Doc`)
        normalize (str)
        stop_words (Set[str])
        window_size (int)

    Returns:
        Dict[int, Dict[str, list]]
    """
    word_occ_vals = collections.defaultdict(lambda: collections.defaultdict(list))

    def _is_upper_cased(tok):
        return tok.is_upper or (tok.is_title and not tok.is_sent_start)

    attr_name = _get_attr_name(normalize, False)
    padding = [None] * window_size
    for sent_idx, sent in enumerate(doc.sents):
        sent_padded = itertoolz.concatv(padding, sent, padding)
        for window in itertoolz.sliding_window(1 + (2 * window_size), sent_padded):
            lwords, word, rwords = window[:window_size], window[window_size], window[window_size + 1:]
            w_id = getattr(word, attr_name)
            if word.is_stop:
                stop_words.add(w_id)
            word_occ_vals[w_id]["is_uc"].append(_is_upper_cased(word))
            word_occ_vals[w_id]["sent_idx"].append(sent_idx)
            word_occ_vals[w_id]["l_context"].extend(
                getattr(w, attr_name) for w in lwords
                if not (w is None or w.is_punct or w.is_space)
            )
            word_occ_vals[w_id]["r_context"].extend(
                getattr(w, attr_name) for w in rwords
                if not (w is None or w.is_punct or w.is_space)
            )
    return word_occ_vals
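
The None padding above guarantees that every real token appears exactly once at the centre of a window of length 1 + 2 * window_size, with its left and right context alongside it; a standalone sketch:

from cytoolz import itertoolz

window_size = 2
tokens = ["a", "b", "c"]
padding = [None] * window_size
padded = itertoolz.concatv(padding, tokens, padding)
for window in itertoolz.sliding_window(1 + 2 * window_size, padded):
    lwords, word, rwords = window[:window_size], window[window_size], window[window_size + 1:]
    print(lwords, word, rwords)
# (None, None) a ('b', 'c')
# (None, 'a') b ('c', None)
# ('a', 'b') c (None, None)
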
github vlukiyanov / pt-sdae / ptsdae / utils.py View on Github
from typing import List

import torch.nn as nn
from cytoolz.itertoolz import sliding_window


class Classifier(nn.Module):
    def __init__(self, dimensions: List[int]):
        super(Classifier, self).__init__()
        units = []
        for from_dimension, to_dimension in sliding_window(2, dimensions):
            units.append(nn.Linear(from_dimension, to_dimension))
            units.append(nn.ReLU())
        self.classifier = nn.Sequential(*units[:-1])  # units[:-1] drops the trailing ReLU after the last linear layer
        self.softmax = nn.LogSoftmax(dim=1)
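
To see what the loop builds, here is the same pattern with an invented dimensions list; dropping the last element of units removes the trailing ReLU so LogSoftmax receives the final layer's raw logits:

import torch.nn as nn
from cytoolz.itertoolz import sliding_window

units = []
for from_dim, to_dim in sliding_window(2, [20, 10, 5]):
    units.append(nn.Linear(from_dim, to_dim))
    units.append(nn.ReLU())
print(nn.Sequential(*units[:-1]))
# Sequential(
#   (0): Linear(in_features=20, out_features=10, bias=True)
#   (1): ReLU()
#   (2): Linear(in_features=10, out_features=5, bias=True)
# )
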
github chartbeat-labs / textacy / textacy / ke / graph_base.py View on Github
        return nx.Graph()

    # if len(terms) < window_size, cytoolz throws a StopIteration error; prevent it
    if len(terms) < window_size:
        LOGGER.info(
            "`terms` has fewer items (%s) than `window_size` (%s); "
            "setting window width to %s",
            len(terms),
            window_size,
            len(terms),
        )
        window_size = len(terms)

    first_term, terms = itertoolz.peek(terms)
    if isinstance(first_term, compat.unicode_):
        windows = itertoolz.sliding_window(window_size, terms)
    elif isinstance(first_term, (Span, Token)):
        windows = itertoolz.sliding_window(
            window_size, utils.normalize_terms(terms, normalize))
    else:
        raise TypeError(
            "items in `terms` must be strings or spacy tokens, not {}".format(
                type(first_term)
            )
        )

    graph = nx.Graph()
    if edge_weighting == "count":
        cooc_mat = collections.Counter(
            w1_w2
            for window in windows
            for w1_w2 in itertools.combinations(sorted(window), 2)