How to use the textacy.compat module in textacy

To help you get started, we’ve selected a few textacy examples based on popular ways it is used in public projects.

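All of the snippets below come from the chartbeat-labs/textacy repository. As a quick orientation, here is a minimal standalone sketch (not taken from the repo) of how textacy.compat is typically imported and used; it assumes compat.unicode_ and the URL helpers exist in your installed version as they do in the snippets that follow.

# minimal sketch, assuming compat.unicode_, compat.urlparse, and
# compat.url_unquote_plus are available in your textacy version
from textacy import compat

# compat.unicode_ is the text type (str on Python 3)
assert isinstance("hello world", compat.unicode_)

# the URL helpers wrap the stdlib's Python-2/3-specific locations
url = "https://example.com/some%20file.txt"
print(compat.urlparse(compat.url_unquote_plus(url)).path)  # -> "/some file.txt"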

github chartbeat-labs / textacy / tests / datasets / test_imdb.py View on Github
def test_records():
    for text, meta in DATASET.records(limit=3):
        assert isinstance(text, compat.unicode_)
        assert isinstance(meta, dict)
github chartbeat-labs / textacy / tests / datasets / test_capitol_words.py View on Github
def test_texts():
    for text in DATASET.texts(limit=3):
        assert isinstance(text, compat.unicode_)
github chartbeat-labs / textacy / textacy / ke / graph_base.py View on Github
        LOGGER.warning("input `terms` is empty, so output graph is also empty")
        return nx.Graph()

    # if len(terms) < window_size, cytoolz throws a StopIteration error; prevent it
    if len(terms) < window_size:
        LOGGER.info(
            "`terms` has fewer items (%s) than `window_size` (%s); "
            "setting window width to %s",
            len(terms),
            window_size,
            len(terms),
        )
        window_size = len(terms)

    first_term, terms = itertoolz.peek(terms)
    if isinstance(first_term, compat.unicode_):
        windows = itertoolz.sliding_window(window_size, terms)
    elif isinstance(first_term, (Span, Token)):
        windows = itertoolz.sliding_window(
            window_size, utils.normalize_terms(terms, normalize))
    else:
        raise TypeError(
            "items in `terms` must be strings or spacy tokens, not {}".format(
                type(first_term)
            )
        )

    graph = nx.Graph()
    if edge_weighting == "count":
        cooc_mat = collections.Counter(
            w1_w2
            for window in windows
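For intuition on the windowing step in this snippet: cytoolz's sliding_window(n, seq) yields consecutive n-length tuples, and each window then appears to contribute its term pairs (w1_w2) to the co-occurrence Counter. A tiny standalone illustration (not from the repo):

from cytoolz import itertoolz

terms = ["natural", "language", "processing", "rocks"]
# window_size=3 -> ("natural", "language", "processing"), ("language", "processing", "rocks")
for window in itertoolz.sliding_window(3, terms):
    print(window)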
github chartbeat-labs / textacy / textacy / datasets / utils.py View on Github
def get_filename_from_url(url):
    """
    Derive a filename from a URL's path.

    Args:
        url (str): URL from which to extract a filename.

    Returns:
        str: Filename in URL.
    """
    return os.path.basename(compat.urlparse(compat.url_unquote_plus(url)).path)
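A hypothetical call to the helper above (the URL is made up for illustration):

get_filename_from_url("https://example.com/data/reviews%20v2.csv.gz")
# -> "reviews v2.csv.gz"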
github chartbeat-labs / textacy / textacy / ke / textrank.py View on Github
    # validate / transform args
    include_pos = utils.to_collection(include_pos, compat.unicode_, set)
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                "topn={} is invalid; "
                "must be an int, or a float between 0.0 and 1.0".format(topn)
            )

    # bail out on empty docs
    if not doc:
        return []

    if position_bias is True:
        word_pos = collections.defaultdict(float)
        for word, norm_word in compat.zip_(doc, ke_utils.normalize_terms(doc, normalize)):
            word_pos[norm_word] += 1 / (word.i + 1)
        sum_word_pos = sum(word_pos.values())
        word_pos = {word: pos / sum_word_pos for word, pos in word_pos.items()}
    else:
        word_pos = None
    # build a graph from all words in doc, then score them
    graph = graph_base.build_graph_from_terms(
        [word for word in doc],
        normalize=normalize,
        window_size=window_size,
        edge_weighting=edge_weighting,
    )
    word_scores = graph_base.rank_nodes_by_pagerank(
        graph, weight="weight", personalization=word_pos)
    # generate a list of candidate terms
    candidates = _get_candidates(doc, normalize, include_pos)
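In this TextRank snippet, compat.zip_ is presumably just a Python-2/3 shim for the built-in zip, and the position bias gives earlier words larger personalization weights (1 / (i + 1), normalized to sum to 1). A small standalone illustration of that weighting, with made-up tokens:

import collections

words = ["machine", "learning", "is", "fun", "machine"]
word_pos = collections.defaultdict(float)
for i, word in enumerate(words):
    word_pos[word] += 1 / (i + 1)  # earlier positions contribute more
total = sum(word_pos.values())
word_pos = {w: p / total for w, p in word_pos.items()}
print(word_pos)  # "machine" accumulates weight from positions 0 and 4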
github chartbeat-labs / textacy / textacy / ke / yake.py View on Github
of several components, then combine components into per-word scores.

    Args:
        doc (:class:`spacy.tokens.Doc`)
        word_occ_vals (Dict[int, Dict[str, list]])
        word_freqs (Dict[int, int])
        stop_words (Set[str])

    Returns:
        Dict[int, float]
    """
    word_weights = collections.defaultdict(dict)
    # compute summary stats for word frequencies
    freqs_nsw = [freq for w_id, freq in word_freqs.items() if w_id not in stop_words]
    freq_max = max(word_freqs.values())
    freq_baseline = compat.mean_(freqs_nsw) + compat.stdev_(freqs_nsw)
    n_sents = itertoolz.count(doc.sents)
    for w_id, vals in word_occ_vals.items():
        freq = word_freqs[w_id]
        word_weights[w_id]["case"] = sum(vals["is_uc"]) / compat.log2_(1 + freq)
        word_weights[w_id]["pos"] = compat.log2_(compat.log2_(3 + compat.median_(vals["sent_idx"])))
        word_weights[w_id]["freq"] = freq / freq_baseline
        word_weights[w_id]["disp"] = len(set(vals["sent_idx"])) / n_sents
        n_unique_lc = len(set(vals["l_context"]))
        n_unique_rc = len(set(vals["r_context"]))
        try:
            wl = n_unique_lc / len(vals["l_context"])
        except ZeroDivisionError:
            wl = 0.0
        try:
            wr = n_unique_rc / len(vals["r_context"])
        except ZeroDivisionError:
            wr = 0.0
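The compat helpers used here (mean_, stdev_, median_, log2_) appear to be thin Python-2/3 shims around the standard library; on Python 3 the equivalents would presumably be statistics.mean, statistics.stdev, statistics.median, and math.log2. A rough standalone sketch of the "freq", "case", and "pos" components under that assumption, with made-up numbers:

import math
import statistics

freqs_nsw = [3, 5, 2, 7]  # hypothetical non-stopword frequencies
freq_baseline = statistics.mean(freqs_nsw) + statistics.stdev(freqs_nsw)  # compat.mean_ / compat.stdev_
freq_weight = 5 / freq_baseline
case_weight = 2 / math.log2(1 + 5)                                        # compat.log2_
pos_weight = math.log2(math.log2(3 + statistics.median([0, 1, 4])))       # compat.median_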
github chartbeat-labs / textacy / textacy / io / write.py View on Github
        make_dirs (bool)
        dialect (str): a grouping of formatting parameters that determine how
            the tabular data is parsed when reading/writing
        delimiter (str): 1-character string used to separate fields in a row

    See Also:
        https://docs.python.org/3/library/csv.html#csv.writer

    Note:
        Here, CSV is used as a catch-all term for *any* delimited file
        format, and ``delimiter=','`` is merely the function's default value.
        Other common delimited formats are TSV (tab-separated-value, with
        ``delimiter='\\t'``) and PSV (pipe-separated-value, with ``delimiter='|'``).
    """
    with open_sesame(filepath, mode='wt', encoding=encoding, newline='') as f:
        csv_writer = compat.csv.writer(f, dialect=dialect, delimiter=delimiter)
        csv_writer.writerows(rows)
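A minimal standalone sketch of writing a delimited file through compat.csv (assuming compat.csv is the stdlib csv module on Python 3), analogous to what the function above does via open_sesame:

from textacy import compat

rows = [["id", "text"], [1, "hello world"], [2, "good-bye"]]
with open("example.tsv", mode="wt", encoding="utf-8", newline="") as f:
    csv_writer = compat.csv.writer(f, delimiter="\t")
    csv_writer.writerows(rows)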
github chartbeat-labs / textacy / textacy / spacier / utils.py View on Github
depending on how much RAM you have available.

            .. note:: Since chunking is done by character, chunks' edges probably
               won't respect natural language segmentation, which means that every
               ``chunk_size`` characters, spaCy will probably get tripped up and
               make weird parsing errors.

    Returns:
        :class:`spacy.tokens.Doc`: A single processed document, initialized from
        components accumulated chunk by chunk.
    """
    if isinstance(lang, compat.unicode_):
        lang = cache.load_spacy_lang(lang)
    elif not isinstance(lang, Language):
        raise TypeError(
            "`lang` must be {}, not {}".format({compat.unicode_, Language}, type(lang))
        )

    words = []
    spaces = []
    np_arrays = []
    cols = [attrs.POS, attrs.TAG, attrs.DEP, attrs.HEAD, attrs.ENT_IOB, attrs.ENT_TYPE]
    text_len = len(text)
    i = 0
    # iterate over text chunks and accumulate components needed to make a doc
    while i < text_len:
        chunk_doc = lang(text[i : i + chunk_size])
        words.extend(tok.text for tok in chunk_doc)
        spaces.extend(bool(tok.whitespace_) for tok in chunk_doc)
        np_arrays.append(chunk_doc.to_array(cols))
        i += chunk_size
    # now, initialize the doc from words and spaces
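The snippet cuts off before the document is assembled; presumably the accumulated pieces are then combined with spaCy's Doc constructor and Doc.from_array. A rough sketch of that final step under that assumption (variable names follow the snippet; this is not taken verbatim from the repo):

import numpy as np
from spacy.tokens import Doc

# hypothetical continuation: build the doc, then restore the per-token attributes
doc = Doc(lang.vocab, words=words, spaces=spaces)
doc = doc.from_array(cols, np.concatenate(np_arrays))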
github chartbeat-labs / textacy / textacy / io / read.py View on Github
        filepath (str): /path/to/file on disk from which rows will be streamed
        encoding (str)
        dialect (str): a grouping of formatting parameters that determine how
            the tabular data is parsed when reading/writing; if 'infer', the
            first 1024 bytes of the file are analyzed, producing a best guess for
            the correct dialect
        delimiter (str): 1-character string used to separate fields in a row

    Yields:
        List[obj]: Next row, whose elements are strings and/or numbers.

    .. seealso:: https://docs.python.org/3/library/csv.html#csv.reader
    """
    with open_sesame(filepath, mode='rt', encoding=encoding, newline='') as f:
        if dialect == 'infer':
            sniffer = compat.csv.Sniffer()
            # add pipes to the list of preferred delimiters, and put spaces last
            sniffer.preferred = [',', '\t', '|', ';', ':', ' ']
            dialect = sniffer.sniff(f.read(1024))
            f.seek(0)
        for row in compat.csv.reader(f, dialect=dialect, delimiter=delimiter):
            yield row
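A standalone sketch of reading a delimited file back through compat.csv with the same dialect-sniffing approach (again assuming compat.csv is the stdlib csv module):

from textacy import compat

with open("example.tsv", mode="rt", encoding="utf-8", newline="") as f:
    dialect = compat.csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    for row in compat.csv.reader(f, dialect=dialect):
        print(row)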