How to use the revscoring.datasources.meta.filters.filter function in revscoring

To help you get started, we’ve selected a few revscoring examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github wikimedia / revscoring / revscoring / features / wikitext / datasources / edit.py View on Github external
self.words_removed = self.tokens_removed_in_types(
            {'word'}, name=self._name + ".words_removed"
        )
        """
        A list of word tokens removed in the edit
        """

        self.uppercase_words_added = filters.filter(
            is_uppercase_word, self.words_added,
            name=self._name + ".uppercase_words_added"
        )
        """
        A list of fully UPPERCASE word tokens added in the edit
        """

        self.uppercase_words_removed = filters.filter(
            is_uppercase_word, self.words_removed,
            name=self._name + ".uppercase_words_removed"
        )
        """
        A list of fully UPPERCASE word tokens removed in the edit
        """

        self.punctuations_added = self.tokens_added_in_types(
            {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon',
             'japan_punct'},
            name=self._name + ".punctuations_added"
        )
        """
        A list of punctuation tokens added in the edit
        """
github wikimedia / revscoring / revscoring / languages / features / dictionary / datasources.py View on Github external
self.dictionary_check = dictionary_check

        # Based on edit.diff
        self.dict_words_added = filters.filter(
            dictionary_check, mappers.map(str, wikitext_diff.words_added),
            name=name + ".dict_words_added"
        )
        self.dict_words_removed = filters.filter(
            dictionary_check, mappers.map(str, wikitext_diff.words_removed),
            name=name + ".dict_words_removed"
        )
        self.non_dict_words_added = filters.filter(
            dictionary_check, mappers.map(str, wikitext_diff.words_added),
            name=name + ".non_dict_words_added", inverse=True
        )
        self.non_dict_words_removed = filters.filter(
            dictionary_check, mappers.map(str, wikitext_diff.words_removed),
            name=name + ".non_dict_words_removed", inverse=True
        )

        # Frequencies
        self.dict_word_delta = frequencies.delta(
            revision.parent.dict_word_frequency,
            revision.dict_word_frequency,
            name=name + ".dict_word_delta"
        )
        self.non_dict_word_delta = frequencies.delta(
            revision.parent.non_dict_word_frequency,
            revision.non_dict_word_frequency,
            name=name + ".non_dict_word_delta"
        )
github wikimedia / revscoring / revscoring / languages / features / stopwords / datasources.py View on Github external
self.is_stopword = is_stopword

        # Based on edit.diff
        self.stopwords_added = filters.filter(
            is_stopword, wikitext_diff.words_added,
            name=name + ".diff.stopwords_added"
        )
        self.stopwords_removed = filters.filter(
            is_stopword, wikitext_diff.words_removed,
            name=name + ".diff.stopwords_removed"
        )
        self.non_stopwords_added = filters.filter(
            is_stopword, wikitext_diff.words_added,
            name=name + ".diff.non_stopwords_added", inverse=True
        )
        self.non_stopwords_removed = filters.filter(
            is_stopword, wikitext_diff.words_removed,
            name=name + ".diff.non_stopwords_removed", inverse=True
        )

        # Frequencies
        self.stopword_delta = frequencies.delta(
            revision.parent.stopword_frequency,
            revision.stopword_frequency,
            name=name + ".diff.stopword_delta"
        )
        self.non_stopword_delta = frequencies.delta(
            revision.parent.non_stopword_frequency,
            revision.non_stopword_frequency,
            name=name + ".diff.non_stopword_delta"
        )
github wikimedia / revscoring / revscoring / features / wikitext / datasources / edit.py View on Github external
def tokens_removed_in_types(self, types, name=None):
        """
        Constructs a :class:`revscoring.Datasource` that represents tokens
        removed that are within a set of types.

        :Parameters:
            types : `iterable`
                The token types to keep.  Converted to a `set` before use.
            name : `str`
                A name for the datasource.  When omitted, a name is built
                from this datasource's name and the requested types.
        """
        types = set(types)
        if name is None:
            # repr() of a set lists members in hash order, which can differ
            # between interpreter runs for strings.  Build the label from
            # sorted member reprs so the default name is reproducible.
            if types:
                types_label = "{" + ", ".join(sorted(map(repr, types))) + "}"
            else:
                # Keep the original spelling for the empty set: "set()"
                types_label = repr(types)
            name = "{0}({1})".format(self._name + ".tokens_removed_in_types",
                                     types_label)
        return filters.filter(TokenIsInTypes(types).filter,
                              self.tokens_removed, name=name)
github wikimedia / revscoring / revscoring / features / wikitext / datasources / edit.py View on Github external
def tokens_added_in_types(self, types, name=None):
        """
        Constructs a :class:`revscoring.Datasource` that represents tokens
        added that are within a set of types.

        :Parameters:
            types : `iterable`
                The token types to keep.  Converted to a `set` before use.
            name : `str`
                A name for the datasource.  When omitted, a name is built
                from this datasource's name and the requested types.
        """
        types = set(types)
        if name is None:
            # repr() of a set lists members in hash order, which can differ
            # between interpreter runs for strings.  Build the label from
            # sorted member reprs so the default name is reproducible.
            if types:
                types_label = "{" + ", ".join(sorted(map(repr, types))) + "}"
            else:
                # Keep the original spelling for the empty set: "set()"
                types_label = repr(types)
            name = "{0}({1})".format(self._name + ".tokens_added_in_types",
                                     types_label)
        return filters.filter(TokenIsInTypes(types).filter, self.tokens_added,
                              name=name)
github wikimedia / revscoring / revscoring / datasources / meta / filters.py View on Github external
:Parameters:
        numbers_datasource : :class:`revscoring.Datasource`
            A datasource that generates the subset of numbers that are positive
        name : `str`
            A name for the datasource.
    """

    def __init__(self, numbers_datasource, name=None):
        """
        Set up the filter with `is_positive` as the predicate applied to
        `numbers_datasource`.  The final name comes from `_format_name`.
        """
        resolved_name = self._format_name(name, [numbers_datasource])
        super().__init__(
            self.is_positive, numbers_datasource, name=resolved_name
        )

    def is_positive(self, v):
        """Return whether `v` is strictly greater than zero."""
        exceeds_zero = v > 0
        return exceeds_zero


class negative(filter):
    """
    Generates a filtered list of negative numbers from a list of numbers.

    :Parameters:
        numbers_datasource : :class:`revscoring.Datasource`
            A datasource that generates a list of numbers; only the
            negative ones are kept
        name : `str`
            A name for the datasource.
    """

    def __init__(self, numbers_datasource, name=None):
        resolved_name = self._format_name(name, [numbers_datasource])
        super().__init__(
            self.is_negative, numbers_datasource, name=resolved_name
        )

    def is_negative(self, v):
        """Return whether `v` is strictly less than zero."""
        below_zero = v < 0
        return below_zero
github wikimedia / revscoring / revscoring / languages / features / dictionary / datasources.py View on Github external
def __init__(self, name, dictionary_check, wikitext_diff, revision):
        super().__init__(name)
        self.dictionary_check = dictionary_check

        # Based on edit.diff
        self.dict_words_added = filters.filter(
            dictionary_check, mappers.map(str, wikitext_diff.words_added),
            name=name + ".dict_words_added"
        )
        self.dict_words_removed = filters.filter(
            dictionary_check, mappers.map(str, wikitext_diff.words_removed),
            name=name + ".dict_words_removed"
        )
        self.non_dict_words_added = filters.filter(
            dictionary_check, mappers.map(str, wikitext_diff.words_added),
            name=name + ".non_dict_words_added", inverse=True
        )
        self.non_dict_words_removed = filters.filter(
            dictionary_check, mappers.map(str, wikitext_diff.words_removed),
            name=name + ".non_dict_words_removed", inverse=True
        )

        # Frequencies
        self.dict_word_delta = frequencies.delta(
            revision.parent.dict_word_frequency,
            revision.dict_word_frequency,
github wikimedia / revscoring / revscoring / languages / features / stopwords / datasources.py View on Github external
def __init__(self, name, is_stopword, wikitext_diff, revision):
        super().__init__(name)
        self.is_stopword = is_stopword

        # Based on edit.diff
        self.stopwords_added = filters.filter(
            is_stopword, wikitext_diff.words_added,
            name=name + ".diff.stopwords_added"
        )
        self.stopwords_removed = filters.filter(
            is_stopword, wikitext_diff.words_removed,
            name=name + ".diff.stopwords_removed"
        )
        self.non_stopwords_added = filters.filter(
            is_stopword, wikitext_diff.words_added,
            name=name + ".diff.non_stopwords_added", inverse=True
        )
        self.non_stopwords_removed = filters.filter(
            is_stopword, wikitext_diff.words_removed,
            name=name + ".diff.non_stopwords_removed", inverse=True
        )

        # Frequencies
        self.stopword_delta = frequencies.delta(
            revision.parent.stopword_frequency,
            revision.stopword_frequency,
github wikimedia / articlequality / articlequality / feature_lists / ptwiki.py View on Github external
images_in_templates + (side_by_side_image_templates * 2) + \
    images_in_tags + infobox_images


# References
def filter_paragraphs_without_ref_tags(segment):
    """
    Check whether a segment has more than 10 word tokens and no ref tokens.

    Every token from ``segment.tokens()`` is inspected; tokens of type
    "ref_open", "ref_close" or "ref_singleton" count as refs.
    """
    ref_types = ("ref_open", "ref_close", "ref_singleton")
    word_count = 0
    ref_count = 0
    for token in segment.tokens():
        if token.type == "word":
            word_count += 1
        if token.type in ref_types:
            ref_count += 1
    return word_count > 10 and ref_count == 0


# Segments from paragraphs_sentences_and_whitespace that pass
# filter_paragraphs_without_ref_tags, i.e. have more than 10 word tokens
# and no ref_open / ref_close / ref_singleton tokens.
paragraphs_without_refs = filters.filter(
    filter_paragraphs_without_ref_tags,
    wikitext.revision.datasources.paragraphs_sentences_and_whitespace,
    name="ptwiki.revision.paragraphs_without_refs"
)

# Maps each kept segment through str and then len, and sums the results --
# presumably the combined character length of the unreferenced paragraphs
# (NOTE(review): assumes mappers.map applies element-wise; confirm).
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, mappers.map(str, paragraphs_without_refs)),
    name="ptwiki.revision.paragraphs_without_refs_total_length"
)

local_wiki = [
    all_images,
    all_images / max(wikitext.revision.content_chars, 1),
    category_links,
    category_links / max(wikitext.revision.content_chars, 1),
    all_ref_tags,