How to use the revscoring.features.meta.aggregators module in revscoring

To help you get started, we’ve selected a few revscoring examples based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github wikimedia / revscoring / revscoring / features / wikitext / features / chars.py View on Github external
def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.chars = aggregators.len(
            self.datasources.text,
            name=self._name + ".chars"
        )
        "`int` : The number of characters in the text"
        self.numeric_chars = aggregators.sum(
            mappers.map(len, self.datasources.numbers),
            name=self._name + ".numeric_chars", returns=int
        )
        "`int` : The number of numeric characters in the text"
        self.whitespace_chars = aggregators.sum(
            mappers.map(len, self.datasources.whitespaces),
            name=self._name + ".whitespace_chars", returns=int
        )
        "`int` : The number of whitespace characters in the text"
        self.markup_chars = aggregators.sum(
            mappers.map(len, self.datasources.markups),
            name=self._name + ".markup_chars", returns=int
        )
        "`int` : The number of wikitext markup characters in the text"
        self.cjk_chars = aggregators.sum(
            mappers.map(len, self.datasources.cjks),
github wikimedia / revscoring / revscoring / languages / features / stopwords / features.py View on Github external
filters.positive(non_stopword_delta_values),
            name=name + ".non_stopword_delta_increase",
            returns=int
        )
        "`int` : The sum of word frequency delta increases for non-stopwords"
        self.non_stopword_delta_decrease = aggregators.sum(
            filters.negative(non_stopword_delta_values),
            name=name + ".non_stopword_delta_decrease",
            returns=int
        )
        "`int` : The sum of word frequency delta decreases for non-stopwords"

        # Proportional word frequency deltas
        stopword_prop_delta_values = \
            dicts.values(self.datasources.stopword_prop_delta)
        self.stopword_prop_delta_sum = aggregators.sum(
            stopword_prop_delta_values,
            name=name + ".stopword_prop_delta_sum"
        )
        "`float` : The sum of proportional word frequency deltas for stopwords"
        self.stopword_prop_delta_increase = aggregators.sum(
            filters.positive(stopword_prop_delta_values),
            name=name + ".stopword_prop_delta_increase"
        )
        """
        `float` : The sum of proportional word frequency delta increases for
        stopwords
        """
        self.stopword_prop_delta_decrease = aggregators.sum(
            filters.negative(stopword_prop_delta_values),
            name=name + ".stopword_prop_delta_decrease"
        )
github wikimedia / articlequality / articlequality / feature_lists / euwiki.py View on Github external
# References
# The revision's wikitext datasources, wrapped under a euwiki-specific name.
revision = Revision(
    "euwiki.revision.revision",
    wikitext.revision.datasources,
)
# Each paragraph/sentence/whitespace segment of the revision, stringified.
paragraphs = mappers.map(
    str, revision.paragraphs_sentences_and_whitespace,
    name="euwiki.revision.paragraphs"
)
# Non-blank paragraphs that contain no "<ref" tag anywhere -- i.e. prose
# with no inline references.
# NOTE(review): the previous pattern contained an empty lookahead "(?!)",
# which never succeeds, so the filter could not match any paragraph; the
# "<ref" text was presumably lost (stripped as an HTML tag).  Restored
# here -- confirm against the upstream source.
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref)(.|\n))*$",
    paragraphs,
    name="euwiki.revision.paragraphs_without_refs"
)
# Total number of characters across all unreferenced paragraphs.
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, paragraphs_without_refs),
    name="euwiki.revision.paragraphs_without_refs_total_length"
)

local_wiki = [
    image_links,
    image_links / max(wikitext.revision.content_chars, 1),
    # category_links,
    # category_links / max(wikitext.revision.content_chars, 1),
    infobox_templates,
    cn_templates + 1,
    cn_templates / max(wikitext.revision.content_chars, 1),
    log(paragraphs_without_refs_total_length + 1),
    basque.dictionary.revision.dict_words,
    basque.dictionary.revision.dict_words / max(wikitext.revision.words, 1),
    english.dictionary.revision.dict_words,
github wikimedia / revscoring / revscoring / languages / features / stemmed / features.py View on Github external
def __init__(self, name, revision_datasources):
        super().__init__(name)

        self.datasources = revision_datasources

        self.unique_stems = aggregators.len(
            dicts.keys(self.datasources.stem_frequency),
            name=name + ".stems"
        )
        """
        `int` : A count of unique stemmed words.
        """

        self.stem_chars = aggregators.sum(
            mappers.map(len, self.datasources.stems),
            name=name + ".stems_length",
            returns=int
        )
        """
        `int` : A count of characters in stemmed words.
        """

        if hasattr(self.datasources, 'parent'):
            self.parent = Revision(name + ".parent", self.datasources.parent)
            """
            :class:`~revscoring.languages.features.stemmed.Revision` : The
            parent revision
            """

        if hasattr(self.datasources, 'diff'):
github wikimedia / articlequality / articlequality / feature_lists / ptwiki.py View on Github external
"Check to see if we have at least 10 words and no refs"
    words = 0
    refs = 0
    for t in segment.tokens():
        words += t.type == "word"
        refs += t.type in ("ref_open", "ref_close", "ref_singleton")
    return words > 10 and refs == 0


# Paragraph segments that pass filter_paragraphs_without_ref_tags (more than
# 10 word tokens and no ref tokens) -- i.e. likely-unsourced prose.
paragraphs_without_refs = filters.filter(
    filter_paragraphs_without_ref_tags,
    wikitext.revision.datasources.paragraphs_sentences_and_whitespace,
    name="ptwiki.revision.paragraphs_without_refs"
)

# Total number of characters across all unreferenced paragraphs: each
# paragraph is stringified first, then the string lengths are summed.
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, mappers.map(str, paragraphs_without_refs)),
    name="ptwiki.revision.paragraphs_without_refs_total_length"
)

local_wiki = [
    all_images,
    all_images / max(wikitext.revision.content_chars, 1),
    category_links,
    category_links / max(wikitext.revision.content_chars, 1),
    all_ref_tags,
    all_ref_tags / max(wikitext.revision.content_chars, 1),
    all_cite_templates,
    all_cite_templates / max(wikitext.revision.content_chars, 1),
    proportion_of_templated_references,
    non_templated_references,
    non_templated_references / max(wikitext.revision.content_chars, 1),
github wikimedia / editquality / editquality / feature_lists / translatewiki.py View on Github external
# Normalized language maps built from the current revision's text and from
# its parent's text (process_normalized_lang_map is defined elsewhere in
# this file; presumably it maps language codes to frequencies -- confirm).
revision_lang_map = Datasource(
    "revision.lang_map", process_normalized_lang_map,
    depends_on=[ro.revision.text])
parent_lang_map = Datasource(
    "revision.parent.lang_map", process_normalized_lang_map,
    depends_on=[ro.revision.parent.text])
# Fixed-order float vector over ALL_NORMALIZED_LANGS for the parent revision.
parent_lang_vector = vectorizers.vectorize(
    parent_lang_map, keys=ALL_NORMALIZED_LANGS, returns=float,
    name="revision.parent.lang_vector")
# Per-language change between the parent's and the current revision's maps.
lang_delta = frequencies.delta(parent_lang_map, revision_lang_map)
lang_delta_vector = vectorizers.vectorize(
    lang_delta, keys=ALL_NORMALIZED_LANGS, returns=float,
    name="revision.diff.lang_delta_vector")
# Sum of absolute per-language deltas: total magnitude of language change
# introduced by this edit.
lang_delta_sum_diff = aggregators.sum(
    mappers.abs(dicts.values(lang_delta)),
    name="revision.diff.lang_delta_sum_diff")
def process_title_lang_match(title_lang, lang_delta):
    """Return the delta recorded for `title_lang`, or 0.0 when absent."""
    if title_lang in lang_delta:
        return lang_delta[title_lang]
    return 0.0


# Value recorded for the translation title's language in the parent's
# language map (0.0 when that language is absent from the map).
parent_lang_match = Feature("revision.parent.lang_match",
                            process_title_lang_match,
                            depends_on=[translation_title_lang,
                                        parent_lang_map],
                            returns=float)
match_lang_delta = Feature("revision.diff.match_lang_delta",
                           process_title_lang_match,
                           depends_on=[translation_title_lang,
github wikimedia / revscoring / revscoring / languages / features / dictionary / features.py View on Github external
def __init__(self, name, revision_datasources):
        super().__init__(name)
        self.datasources = revision_datasources

        self.dict_words = aggregators.len(self.datasources.dict_words)
        "`int` : A count of the number of dictionary words in the revision"
        self.non_dict_words = \
            aggregators.len(self.datasources.non_dict_words)
        "`int` : A count of the number of non-dictionary words in the revision"

        if hasattr(self.datasources, 'parent'):
            self.parent = Revision(name + ".parent", self.datasources.parent)
            """
            :class:`~revscoring.languages.features.dictionary.Revision` : The
            parent revision
            """

        if hasattr(self.datasources, 'diff'):
            self.diff = Diff(name + ".diff", self.datasources.diff)
            """
            :class:`~revscoring.languages.features.dictionary.Diff` : The
github wikimedia / revscoring / revscoring / features / wikitext / edit / diff / chars.py View on Github external
from itertools import groupby

from . import datasources
from .....datasources.meta import mappers
from ....feature import Feature
from ....meta import aggregators
from .util import prefix

# Sum of the lengths of all text segments added by this edit.
chars_added = aggregators.sum(
    mappers.map(len, datasources.segments_added),
    name=prefix + ".chars_added", returns=int
)
"""
A count of the number of characters added in this edit.
"""

# Sum of the lengths of all text segments removed by this edit.
chars_removed = aggregators.sum(
    mappers.map(len, datasources.segments_removed),
    name=prefix + ".chars_removed", returns=int
)
"""
A count of the number of characters removed in this edit.
"""

numeric_chars_added = aggregators.sum(
github wikimedia / articlequality / articlequality / feature_lists / wikidatawiki.py View on Github external
# Number of distinct sources cited anywhere in the item.
unique_references_count = aggregators.len(unique_references)
"`int` : A count of unique sources in the revision"

# Status: biography-related predicates built from Wikibase claims.
is_human = wikibase_.revision.has_property_value(
    properties.INSTANCE_OF, items.HUMAN, name=name + '.revision.is_human')
has_birthday = wikibase_.revision.has_property(
    properties.DATE_OF_BIRTH, name=name + '.revision.has_birthday')
dead = wikibase_.revision.has_property(
    properties.DATE_OF_DEATH, name=name + '.revision.dead')
# "Biography of a living person": has a date of birth and no date of death.
# NOTE(review): does not also require is_human -- confirm that is intended.
is_blp = has_birthday.and_(not_(dead))

# Wiki-specific features appended to the generic wikibase item features.
local_wiki = [
    is_human,
    is_blp,
    aggregators.len(complete_translations),
    aggregators.len(important_label_translations),
    aggregators.len(important_description_translations),
    aggregators.len(important_complete_translations),
    references_count,
    wikimedia_references_count,
    # Proportions are guarded with max(..., 1) to avoid division by zero.
    wikimedia_references_count / modifiers.max(references_count, 1),
    external_references_count,
    external_references_count / modifiers.max(references_count, 1),
    unique_references_count,
    unique_references_count / modifiers.max(references_count, 1)
]

item_quality = wikibase.item + local_wiki
github wikimedia / revscoring / revscoring / features / wikitext / features / tokenized.py View on Github external
)
        "`int` : The sum of delta changes in the number frequency table"

        self.number_delta_increase = aggregators.sum(
            filters.positive(dicts.values(self.datasources.number_delta)),
            name=self._name + ".number_delta_increase"
        )
        "`int` : The sum of delta increases in the number frequency table"

        self.number_delta_decrease = aggregators.sum(
            filters.negative(dicts.values(self.datasources.number_delta)),
            name=self._name + ".number_delta_decrease"
        )
        "`int` : The sum of delta decreases in the number frequency table"

        self.number_prop_delta_sum = aggregators.sum(
            dicts.values(self.datasources.number_prop_delta),
            name=self._name + ".number_prop_delta_sum"
        )
        """
        `int` : The sum of proportional delta changes in the number
        frequency table
        """

        self.number_prop_delta_increase = aggregators.sum(
            filters.positive(dicts.values(self.datasources.number_prop_delta)),
            name=self._name + ".number_prop_delta_increase"
        )
        """
        `int` : The sum of proportional delta increases in the number
        frequency table
        """