How to use the textacy.utils.validate_and_clip_range function in textacy

To help you get started, we’ve selected a few textacy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github chartbeat-labs / textacy / textacy / datasets / oxford_text_archive.py View on Github external
def _get_filters(self, author, date_range, min_len):
        filters = []
        if min_len is not None:
            if min_len < 1:
                raise ValueError("`min_len` must be at least 1")
            filters.append(
                lambda record: len(record.get("text", "")) >= min_len
            )
        if author is not None:
            author = utils.validate_set_members(
                author, (str, bytes), valid_vals=self.authors)
            filters.append(
                lambda record: record.get("author") and any(athr in author for athr in record["author"])
            )
        if date_range is not None:
            date_range = utils.validate_and_clip_range(
                date_range, self.full_date_range, val_type=(str, bytes))
            filters.append(
                lambda record: record.get("year") and date_range[0] <= record["year"] < date_range[1]
            )
        return filters
github chartbeat-labs / textacy / textacy / datasets / reddit_comments.py View on Github external
def _get_filters(self, subreddit, date_range, score_range, min_len):
        filters = []
        if min_len is not None:
            if min_len < 1:
                raise ValueError("`min_len` must be at least 1")
            filters.append(
                lambda record: len(record.get("body", "")) >= min_len
            )
        if subreddit is not None:
            subreddit = utils.validate_set_members(subreddit, (str, bytes))
            filters.append(
                lambda record: record.get("subreddit") in subreddit
            )
        if date_range is not None:
            date_range = utils.validate_and_clip_range(
                date_range, self.full_date_range, val_type=(str, bytes))
            filters.append(
                lambda record: (
                    record.get("created_utc")
                    and date_range[0] <= record["created_utc"] < date_range[1]
                )
            )
        if score_range is not None:
            score_range = utils.validate_and_clip_range(
                score_range, self._full_score_range, val_type=(int, float))
            filters.append(
                lambda record: (
                    record.get("score")
                    and score_range[0] <= record["score"] < score_range[1]
                )
            )
github chartbeat-labs / textacy / textacy / datasets / capitol_words.py View on Github external
speaker_name,
        speaker_party,
        chamber,
        congress,
        date_range,
        min_len,
    ):
        filters = []
        if min_len is not None:
            if min_len < 1:
                raise ValueError("`min_len` must be at least 1")
            filters.append(
                lambda record: len(record.get("text", "")) >= min_len
            )
        if date_range is not None:
            date_range = utils.validate_and_clip_range(
                date_range, self.full_date_range, val_type=(str, bytes))
            filters.append(
                lambda record: (
                    record.get("date")
                    and date_range[0] <= record["date"] < date_range[1]
                )
            )
        if speaker_name is not None:
            speaker_name = utils.validate_set_members(
                speaker_name, (str, bytes), valid_vals=self.speaker_names)
            filters.append(lambda record: record.get("speaker_name") in speaker_name)
        if speaker_party is not None:
            speaker_party = utils.validate_set_members(
                speaker_party, (str, bytes), valid_vals=self.speaker_parties)
            filters.append(lambda record: record.get("speaker_party") in speaker_party)
        if chamber is not None:
github chartbeat-labs / textacy / textacy / datasets / supreme_court.py View on Github external
self,
        opinion_author,
        decision_direction,
        issue_area,
        date_range,
        min_len,
    ):
        filters = []
        if min_len is not None:
            if min_len < 1:
                raise ValueError("`min_len` must be at least 1")
            filters.append(
                lambda record: len(record.get("text", "")) >= min_len
            )
        if date_range is not None:
            date_range = utils.validate_and_clip_range(
                date_range, self.full_date_range, val_type=(str, bytes))
            filters.append(
                lambda record: (
                    record.get("decision_date")
                    and date_range[0] <= record["decision_date"] < date_range[1]
                )
            )
        if opinion_author is not None:
            opinion_author = utils.validate_set_members(
                opinion_author, int, valid_vals=self.opinion_author_codes)
            filters.append(
                lambda record: record.get("maj_opinion_author") in opinion_author)
        if decision_direction is not None:
            decision_direction = utils.validate_set_members(
                decision_direction, (str, bytes), valid_vals=self.decision_directions)
            filters.append(
github chartbeat-labs / textacy / textacy / datasets / imdb.py View on Github external
def _get_filters(self, rating_range, min_len):
        filters = []
        if min_len is not None:
            if min_len < 1:
                raise ValueError("`min_len` must be at least 1")
            filters.append(
                lambda record: len(record.get("text", "")) >= min_len
            )
        if rating_range is not None:
            rating_range = utils.validate_and_clip_range(
                rating_range, self.full_rating_range, val_type=int)
            filters.append(
                lambda record: (
                    record.get("rating")
                    and rating_range[0] <= record["rating"] < rating_range[1]
                )
            )
        return filters
github chartbeat-labs / textacy / textacy / datasets / reddit_comments.py View on Github external
def download(self, *, date_range=(None, None), force=False):
        """
        Fetch one or more monthly Reddit comments files from archive.org,
        saving each of them to disk under the ``data_dir`` directory.

        Args:
            date_range (Tuple[str]): The [start, end) interval of dates whose
                comments files should be downloaded. Items are strings in
                YYYY-MM or YYYY-MM-DD form (the latter gets converted to its
                YYYY-MM equivalent). Both a start and an end value must be
                given, though a null value in either position is replaced
                automatically by the minimum or maximum valid value,
                respectively.
            force (bool): If True, download the files even when they already
                exist on disk under ``data_dir``.
        """
        clipped_range = utils.validate_and_clip_range(
            date_range, self.full_date_range, val_type=(str, bytes))
        # each file stub serves as both the URL suffix and the local filename
        for stub in self._generate_filestubs(clipped_range):
            remote_url = urllib.parse.urljoin(DOWNLOAD_ROOT, stub)
            tio.download_file(
                remote_url,
                filename=stub,
                dirpath=self.data_dir,
                force=force,
            )
github chartbeat-labs / textacy / textacy / datasets / reddit_comments.py View on Github external
if subreddit is not None:
            subreddit = utils.validate_set_members(subreddit, (str, bytes))
            filters.append(
                lambda record: record.get("subreddit") in subreddit
            )
        if date_range is not None:
            date_range = utils.validate_and_clip_range(
                date_range, self.full_date_range, val_type=(str, bytes))
            filters.append(
                lambda record: (
                    record.get("created_utc")
                    and date_range[0] <= record["created_utc"] < date_range[1]
                )
            )
        if score_range is not None:
            score_range = utils.validate_and_clip_range(
                score_range, self._full_score_range, val_type=(int, float))
            filters.append(
                lambda record: (
                    record.get("score")
                    and score_range[0] <= record["score"] < score_range[1]
                )
            )
        return filters