How to use the textacy.utils.validate_set_members function in textacy

To help you get started, we’ve selected a few examples of textacy.utils.validate_set_members, based on popular ways the function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: chartbeat-labs/textacy, file textacy/datasets/wikimedia.py (view on GitHub)
def _get_filters(self, category, wiki_link, min_len):
        """
        Build the list of record predicates implied by the given criteria.

        Each criterion that is not None contributes one callable that takes a
        record (an object supporting ``.get(key)`` and ``[key]``) and returns a
        truthy value when the record passes.

        Args:
            category: If not None, it is passed through
                ``utils.validate_set_members`` (members of type str or bytes,
                ``valid_vals=None``); a record passes when its "categories"
                entry is truthy and contains at least one of the members.
            wiki_link: Handled like ``category``, but matched against the
                record's "wiki_links" entry.
            min_len: If not None, the minimum length of the record's "text"
                entry (a missing entry counts as an empty string).

        Returns:
            list: The predicates, in the order min_len, category, wiki_link;
            empty when every criterion is None.

        Raises:
            ValueError: If ``min_len`` is given but is less than 1.
        """
        filters = []
        if min_len is not None:
            if min_len < 1:
                raise ValueError("`min_len` must be at least 1")
            filters.append(
                lambda record: len(record.get("text", "")) >= min_len
            )
        if category is not None:
            category = utils.validate_set_members(
                category, (str, bytes), valid_vals=None)
            filters.append(
                lambda record: (
                    record.get("categories")
                    and any(ctgry in record["categories"] for ctgry in category)
                )
            )
        if wiki_link is not None:
            wiki_link = utils.validate_set_members(
                wiki_link, (str, bytes), valid_vals=None)
            filters.append(
                lambda record: (
                    record.get("wiki_links")
                    and any(wl in record["wiki_links"] for wl in wiki_link)
                )
            )
        # Hand the assembled predicates back to the caller; without this the
        # function would implicitly return None and the filters would be lost.
        return filters
Source: chartbeat-labs/textacy, file textacy/datasets/reddit_comments.py (view on GitHub)
def _get_filters(self, subreddit, date_range, score_range, min_len):
        filters = []
        if min_len is not None:
            if min_len < 1:
                raise ValueError("`min_len` must be at least 1")
            filters.append(
                lambda record: len(record.get("body", "")) >= min_len
            )
        if subreddit is not None:
            subreddit = utils.validate_set_members(subreddit, (str, bytes))
            filters.append(
                lambda record: record.get("subreddit") in subreddit
            )
        if date_range is not None:
            date_range = utils.validate_and_clip_range(
                date_range, self.full_date_range, val_type=(str, bytes))
            filters.append(
                lambda record: (
                    record.get("created_utc")
                    and date_range[0] <= record["created_utc"] < date_range[1]
                )
            )
        if score_range is not None:
            score_range = utils.validate_and_clip_range(
                score_range, self._full_score_range, val_type=(int, float))
            filters.append(
Source: chartbeat-labs/textacy, file textacy/datasets/capitol_words.py (view on GitHub)
)
            )
        if speaker_name is not None:
            speaker_name = utils.validate_set_members(
                speaker_name, (str, bytes), valid_vals=self.speaker_names)
            filters.append(lambda record: record.get("speaker_name") in speaker_name)
        if speaker_party is not None:
            speaker_party = utils.validate_set_members(
                speaker_party, (str, bytes), valid_vals=self.speaker_parties)
            filters.append(lambda record: record.get("speaker_party") in speaker_party)
        if chamber is not None:
            chamber = utils.validate_set_members(
                chamber, (str, bytes), valid_vals=self.chambers)
            filters.append(lambda record: record.get("chamber") in chamber)
        if congress is not None:
            congress = utils.validate_set_members(
                congress, int, valid_vals=self.congresses)
            filters.append(lambda record: record.get("congress") in congress)
        return filters
Source: chartbeat-labs/textacy, file textacy/datasets/oxford_text_archive.py (view on GitHub)
def _get_filters(self, author, date_range, min_len):
        """
        Assemble the record predicates that correspond to the given criteria.

        Every criterion that is not None adds one callable to the result. Each
        callable takes a record (an object supporting ``.get(key)`` and
        ``[key]``) and returns a truthy value when the record passes.

        Args:
            author: If not None, it is validated with
                ``utils.validate_set_members`` (str/bytes members, checked
                against ``self.authors``); a record passes when its "author"
                entry is truthy and any of its items is in the validated set.
            date_range: If not None, it is validated and clipped with
                ``utils.validate_and_clip_range`` against
                ``self.full_date_range``; a record passes when its "year" entry
                is truthy and lies in the half-open interval [start, end).
            min_len: If not None, the minimum length of the record's "text"
                entry (a missing entry counts as an empty string).

        Returns:
            list: The predicates, in the order min_len, author, date_range.

        Raises:
            ValueError: If ``min_len`` is given but is less than 1.
        """
        # Reject an invalid length threshold before doing anything else.
        if min_len is not None and min_len < 1:
            raise ValueError("`min_len` must be at least 1")

        predicates = []

        if min_len is not None:
            def _is_long_enough(record):
                return len(record.get("text", "")) >= min_len

            predicates.append(_is_long_enough)

        if author is not None:
            wanted_authors = utils.validate_set_members(
                author, (str, bytes), valid_vals=self.authors)

            def _has_wanted_author(record):
                return record.get("author") and any(
                    name in wanted_authors for name in record["author"]
                )

            predicates.append(_has_wanted_author)

        if date_range is not None:
            clipped_range = utils.validate_and_clip_range(
                date_range, self.full_date_range, val_type=(str, bytes))

            def _is_within_range(record):
                return (
                    record.get("year")
                    and clipped_range[0] <= record["year"] < clipped_range[1]
                )

            predicates.append(_is_within_range)

        return predicates
Source: chartbeat-labs/textacy, file textacy/datasets/capitol_words.py (view on GitHub)
)
        if date_range is not None:
            date_range = utils.validate_and_clip_range(
                date_range, self.full_date_range, val_type=(str, bytes))
            filters.append(
                lambda record: (
                    record.get("date")
                    and date_range[0] <= record["date"] < date_range[1]
                )
            )
        if speaker_name is not None:
            speaker_name = utils.validate_set_members(
                speaker_name, (str, bytes), valid_vals=self.speaker_names)
            filters.append(lambda record: record.get("speaker_name") in speaker_name)
        if speaker_party is not None:
            speaker_party = utils.validate_set_members(
                speaker_party, (str, bytes), valid_vals=self.speaker_parties)
            filters.append(lambda record: record.get("speaker_party") in speaker_party)
        if chamber is not None:
            chamber = utils.validate_set_members(
                chamber, (str, bytes), valid_vals=self.chambers)
            filters.append(lambda record: record.get("chamber") in chamber)
        if congress is not None:
            congress = utils.validate_set_members(
                congress, int, valid_vals=self.congresses)
            filters.append(lambda record: record.get("congress") in congress)
        return filters