How to use the textacy.utils module in textacy

To help you get started, we’ve selected a few textacy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github chartbeat-labs / textacy / textacy / datasets / supreme_court.py View on Github external
record.get("decision_date")
                    and date_range[0] <= record["decision_date"] < date_range[1]
                )
            )
        if opinion_author is not None:
            opinion_author = utils.validate_set_members(
                opinion_author, int, valid_vals=self.opinion_author_codes)
            filters.append(
                lambda record: record.get("maj_opinion_author") in opinion_author)
        if decision_direction is not None:
            decision_direction = utils.validate_set_members(
                decision_direction, (str, bytes), valid_vals=self.decision_directions)
            filters.append(
                lambda record: record.get("decision_direction") in decision_direction)
        if issue_area is not None:
            issue_area = utils.validate_set_members(
                issue_area, int, valid_vals=self.issue_area_codes)
            filters.append(
                lambda record: record.get("issue_area") in issue_area)
        return filters
github chartbeat-labs / textacy / textacy / spacy_pipelines.py View on Github external
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals

import logging

from . import utils

LOGGER = logging.getLogger(__name__)

# Announce the module's deprecation; this call sits at module top level, so it
# runs when the module is imported.
# NOTE: adjacent string literals are joined with no separator, so the first
# literal must end with a space to keep the two sentences apart.
# NOTE(review): `action="once"` presumably maps to a `warnings` filter action
# so the notice is shown a single time -- confirm against `utils.deprecated`.
utils.deprecated(
    "The `spacy_pipelines` module is deprecated and will be removed in v0.7.0. "
    "Use the `textacy.spacier` subpackage instead.",
    action="once",
)


def _merge_entities(doc):
    """
    Merge named entities *in-place* within parent ``doc`` so that each becomes
    a single token.

    Args:
        doc (``spacy.doc``)
    """
    for ent in doc.ents:
        try:
github chartbeat-labs / textacy / scripts / train_lang_identifier.py View on Github external
a "uniformly sampled" collection of ~120k tweets over all languages and
    a "recall oriented" collection of ~1.5k tweets per language --
    then fetch available tweets' data from the Twitter API.

    Args:
        dirpath (str or :class:`pathlib.Path`)
        creds_fpath (str or :class:`pathlib.Path`)
        force (bool)

    References:
        https://blog.twitter.com/engineering/en_us/a/2015/evaluating-language-identification-performance.html

    TODO: Ideally, use a tweet search endpoint and filter by language,
    then just iterate over all ISO-639-1 language codes.
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    url_fnames = [
        (
            "https://raw.githubusercontent.com/mitjat/langid_eval/master/uniformly_sampled.tsv",
            "uniformly_sampled.tsv",
        ),
        (
            "https://raw.githubusercontent.com/mitjat/langid_eval/master/recall_oriented.tsv",
            "recall_oriented.tsv",
        )
    ]
    # download tweet ids first
    for url, fname in url_fnames:
        textacy.io.download_file(url, filename=fname, dirpath=dirpath, force=force)
    # download full tweets data next
    tweets_fpath = dirpath.joinpath("tweets.jsonl")
    if tweets_fpath.is_file() and force is False:
github chartbeat-labs / textacy / textacy / spacy_utils.py View on Github external
from __future__ import absolute_import, division, print_function, unicode_literals

import itertools
import logging

from spacy.symbols import NOUN, PROPN, VERB
from spacy.tokens.token import Token as SpacyToken
from spacy.tokens.span import Span as SpacySpan

from . import constants
from . import text_utils
from . import utils

LOGGER = logging.getLogger(__name__)

# Announce the module's deprecation; this call sits at module top level, so it
# runs when the module is imported.
# NOTE: adjacent string literals are joined with no separator, so the first
# literal must end with a space to keep the two sentences apart.
# NOTE(review): `action="once"` presumably maps to a `warnings` filter action
# so the notice is shown a single time -- confirm against `utils.deprecated`.
utils.deprecated(
    "The `spacy_utils` module is deprecated and will be removed in v0.7.0. "
    "Use the `textacy.spacier` subpackage instead.",
    action="once",
)


def is_plural_noun(token):
    """
    Returns True if token is a plural noun, False otherwise.

    Args:
        token (``spacy.Token``): parent document must have POS information

    Returns:
        bool
    """