How to use the cytoolz.itertoolz.count function in cytoolz

To help you get started, we’ve selected a few cytoolz.itertoolz.count examples based on popular ways it is used in public projects.

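Before the project snippets below, here is a minimal, self-contained sketch of what itertoolz.count does: it consumes any iterable, including generators that len() cannot handle, and returns the number of items. (The values here are made up for illustration.)

from cytoolz import itertoolz

# len() raises TypeError on a generator; count() simply consumes it and tallies the items
evens = (n for n in range(10) if n % 2 == 0)
print(itertoolz.count(evens))      # 5
print(itertoolz.count([1, 2, 3]))  # 3 -- works on regular sequences too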

chartbeat-labs / textacy / scripts / train_lang_identifier.py (view on GitHub)
"""
    Args:
        dirpath (str or :class:`pathlib.Path`)
        langs (Set[str])
        min_len (int)

    Returns:
        List[Tuple[str, str]]
    """
    ds = textacy.datasets.UDHR(data_dir=dirpath)
    data = [
        (snippet, meta["lang"])
        for text, meta in ds.records()
        for snippet in text.split("\n")
        if meta["lang"] in langs
        and itertoolz.count(char for char in snippet if char.isalnum()) >= min_len
    ]
    return data

chartbeat-labs / textacy / textacy / corpus.py (view on GitHub)
def _add_valid_doc(self, doc):
        self.docs.append(doc)
        self._doc_ids.append(id(doc))
        self.n_docs += 1
        self.n_tokens += len(doc)
        if doc.is_sentenced:
            # doc.sents is a generator, so use count() rather than len()
            self.n_sents += itertoolz.count(doc.sents)
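
In spaCy, doc.sents is a generator of sentence spans, so it has no len(); itertoolz.count walks it once and returns how many sentences it yields. Here is a rough stand-in with a plain generator in place of the spaCy objects (the sentences() helper below is invented for illustration):

from cytoolz import itertoolz

def sentences(text):
    # hypothetical stand-in for spaCy's doc.sents: yields "sentences" lazily
    for sent in text.split(". "):
        if sent:
            yield sent

n_sents = itertoolz.count(sentences("First sentence. Second one. Third."))
print(n_sents)  # 3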

chartbeat-labs / textacy / textacy / text_stats.py (view on GitHub)
def __init__(self, doc):
        self.lang = doc.vocab.lang
        self.n_sents = itertoolz.count(doc.sents) if doc.is_sentenced else None
        # get objs for basic count computations
        hyphenator = cache.load_hyphenator(lang=self.lang)
        words = tuple(
            extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False)
        )
        syllables_per_word = tuple(
            len(hyphenator.positions(word.lower_)) + 1 for word in words
        )
        chars_per_word = tuple(len(word) for word in words)
        # compute basic counts needed for most readability stats
        self.n_words = len(words)
        self.n_unique_words = len({word.lower for word in words})
        self.n_chars = sum(chars_per_word)
        self.n_long_words = sum(1 for cpw in chars_per_word if cpw >= 7)
        self.n_syllables = sum(syllables_per_word)
        self.n_monosyllable_words = sum(1 for spw in syllables_per_word if spw == 1)

chartbeat-labs / textacy / textacy / corpus.py (view on GitHub)
def _remove_one_doc_by_index(self, idx):
        doc = self.docs[idx]
        self.n_docs -= 1
        self.n_tokens -= len(doc)
        if doc.is_sentenced:
            self.n_sents -= itertoolz.count(doc.sents)
        del self.docs[idx]
        del self._doc_ids[idx]

chartbeat-labs / textacy / textacy / ke / scake.py (view on GitHub)
# validate / transform args
    include_pos = utils.to_collection(include_pos, compat.unicode_, set)
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                "topn={} is invalid; "
                "must be an int, or a float between 0.0 and 1.0".format(topn)
            )

    # bail out on empty docs
    if not doc:
        return []

    # build up a graph of good words, edges weighting by adjacent sentence co-occurrence
    cooc_mat = collections.Counter()
    n_sents = itertoolz.count(doc.sents)  # in case doc only has 1 sentence
    for sent1, sent2 in itertoolz.sliding_window(min(2, n_sents), doc.sents):
        window_words = (
            word
            for word in itertoolz.concatv(sent1, sent2)
            if not (word.is_stop or word.is_punct or word.is_space)
            and (not include_pos or word.pos_ in include_pos)
        )
        window_words = ke_utils.normalize_terms(window_words, normalize)
        cooc_mat.update(
            w1_w2
            for w1_w2 in itertools.combinations(sorted(window_words), 2)
            if w1_w2[0] != w1_w2[1]
        )
    # doc doesn't have any valid words...
    if not cooc_mat:
        return []
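
Note how count() also feeds sliding_window above: the window size is min(2, n_sents), so a single-sentence doc still produces one window of size 1 instead of nothing (and since doc.sents yields a fresh generator each time it is accessed, counting it first does not exhaust the iterator later passed to sliding_window). A stripped-down sketch with plain lists standing in for sentences:

from cytoolz import itertoolz

sents = [["only", "one", "sentence"]]   # pretend these are doc.sents
n_sents = itertoolz.count(sents)        # works on any iterable; here it's 1
window_size = min(2, n_sents)           # sliding_window(2, ...) over 1 item would yield nothing
print(list(itertoolz.sliding_window(window_size, sents)))
# [(['only', 'one', 'sentence'],)]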

chartbeat-labs / textacy / textacy / keyterms.py (view on GitHub)
)
            word_vals[wid]["right_context"].extend(
                w.lower for w in rwords
                if not (w is None or w.is_punct or w.is_space)
            )

    # compute word frequencies and aggregated statistics
    word_freqs = {wid: len(vals["is_upper_cased"]) for wid, vals in word_vals.items()}
    freqs_nsw = [freq for wid, freq in word_freqs.items() if wid not in stop_words]
    freqs_max = max(word_freqs.values())
    freq_mean = compat.mean_(freqs_nsw)
    freq_stdev = compat.stdev_(freqs_nsw)

    # compute per-word weights
    word_weights = collections.defaultdict(dict)
    n_sents = itertoolz.count(doc.sents)
    for wid, vals in word_vals.items():
        freq = word_freqs[wid]
        word_weights[wid]["case"] = sum(vals["is_upper_cased"]) / math.log2(1 + freq)
        word_weights[wid]["pos"] = math.log2(math.log2(3 + compat.median_(vals["sent_idx"])))
        word_weights[wid]["freq"] = freq / (freq_mean + freq_stdev)
        n_unique_lc = len(set(vals["left_context"]))
        n_unique_rc = len(set(vals["right_context"]))
        try:
            wl = n_unique_lc / len(vals["left_context"])
        except ZeroDivisionError:
            wl = 0.0
        try:
            wr = n_unique_rc / len(vals["right_context"])
        except ZeroDivisionError:
            wr = 0.0
        pl = n_unique_lc / freqs_max

chartbeat-labs / textacy / scripts / train_lang_identifier.py (view on GitHub)
"""
    dirpath = textacy.utils.to_path(dirpath).resolve()
    raw_tweets = textacy.io.read_json(
        dirpath.joinpath("tweets.jsonl"), mode="rt", lines=True)
    tweets = []
    for tweet in raw_tweets:
        # totally remove any URLS from tweet text
        for url in tweet.get("urls", []):
            for item in url.values():
                tweet["text"] = tweet["text"].replace(item, "")
        tweets.append(tweet)
    ds = [
        (tweet["text"], tweet["lang"])
        for tweet in tweets
        if tweet["lang"] in langs
        and itertoolz.count(char for char in tweet["text"] if char.isalnum()) >= min_len
    ]
    return ds
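
Both train_lang_identifier.py excerpts use the same filtering pattern: keep a record only if it contains at least min_len alphanumeric characters, with count() doing the tally lazily inside the comprehension. A stripped-down version of that filter (the records and min_len below are made up):

from cytoolz import itertoolz

records = [("Hi!", "en"), ("Bonjour tout le monde", "fr"), ("??", "und")]
min_len = 5

kept = [
    (text, lang)
    for text, lang in records
    if itertoolz.count(ch for ch in text if ch.isalnum()) >= min_len
]
print(kept)  # [('Bonjour tout le monde', 'fr')]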