How to use the revscoring.datasources.meta.mappers.map function in revscoring

To help you get started, we’ve selected a few revscoring examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github wikimedia / revscoring / revscoring / features / wikitext / features / chars.py View on Github external
self.datasources.text,
            name=self._name + ".chars"
        )
        "`int` : The number of characters in the text"
        self.numeric_chars = aggregators.sum(
            mappers.map(len, self.datasources.numbers),
            name=self._name + ".numeric_chars", returns=int
        )
        "`int` : The number of numeric characters in the text"
        self.whitespace_chars = aggregators.sum(
            mappers.map(len, self.datasources.whitespaces),
            name=self._name + ".whitespace_chars", returns=int
        )
        "`int` : The number of whitespace characters in the text"
        self.markup_chars = aggregators.sum(
            mappers.map(len, self.datasources.markups),
            name=self._name + ".markup_chars", returns=int
        )
        "`int` : The number of wikitext markup characters in the text"
        self.cjk_chars = aggregators.sum(
            mappers.map(len, self.datasources.cjks),
            name=self._name + ".cjk_chars", returns=int
        )
        "`int` : The number of Chinese/Japanese/Korean characters in the text"
        self.entity_chars = aggregators.sum(
            mappers.map(len, self.datasources.entities),
            name=self._name + ".entity_chars", returns=int
        )
        "`int` : The number of HTML entity characters in the text"
        self.url_chars = aggregators.sum(
            mappers.map(len, self.datasources.urls),
            name=self._name + ".url_chars", returns=int
github wikimedia / articlequality / articlequality / feature_lists / euwiki.py View on Github external
cn_templates = wikitext.revision.template_names_matching(
    r"erref[ _]behar", name="euwiki.revision.cn_templates")

# Links
# Excluding category_links based on https://phabricator.wikimedia.org/T240467
# category_links = wikitext.revision.wikilink_titles_matching(
#    r"(Kategoria|Category)\:", name="euwiki.revision.category_links")
image_links = wikitext.revision.wikilink_titles_matching(
    r"(File|Image|Fitxategi)\:", name="euwiki.revision.image_links")

# References
revision = Revision(
    "euwiki.revision.revision",
    wikitext.revision.datasources,
)
# Each item of `revision.paragraphs_sentences_and_whitespace` is passed
# through `str` so the regex filter below operates on plain strings.
paragraphs = mappers.map(
    str, revision.paragraphs_sentences_and_whitespace,
    name="euwiki.revision.paragraphs"
)
# Keep only the paragraphs that are non-blank and contain no "<ref>" tag.
#   ^(?!\s*$)            rejects empty / whitespace-only paragraphs
#   ((?!<ref>)(.|\n))*   consumes one character at a time, asserting before
#                        each one that "<ref>" does not start at that position
# NOTE: the lookahead must name the tag. An empty lookahead `(?!)` can never
# succeed, which makes the whole pattern match nothing.
paragraphs_without_refs = filters.regex_matching(
    r"^(?!\s*$)((?!<ref>)(.|\n))*$",
    paragraphs,
    name="euwiki.revision.paragraphs_without_refs"
)
# Total character length of the paragraphs that carry no reference.
paragraphs_without_refs_total_length = aggregators.sum(
    mappers.map(len, paragraphs_without_refs),
    name="euwiki.revision.paragraphs_without_refs_total_length"
)

local_wiki = [
    image_links,
    image_links / max(wikitext.revision.content_chars, 1),
github wikimedia / revscoring / revscoring / datasources / meta / mappers.py View on Github external
:Parameters:
        strs_datasource : :class:`revscoring.Datasource`
            A datasource that generates a list of `str`
        name : `str`
            A name for the datasource.
    """

    def __init__(self, strs_datasource, name=None):
        """
        Hand :func:`no_repeat` and `strs_datasource` to the parent
        constructor.

        :Parameters:
            strs_datasource : :class:`revscoring.Datasource`
                A datasource that generates a list of `str`
            name : `str`
                A name for the datasource. It is passed through
                `_format_name` before reaching the parent constructor.
        """
        resolved_name = self._format_name(name, [strs_datasource])
        super().__init__(
            self.no_repeat,
            strs_datasource,
            name=resolved_name,
        )

    def no_repeat(self, s):
        """
        Collapse every run of consecutive identical characters in `s` into
        a single character (e.g. "aaabbc" --> "abc").
        """
        # groupby yields one (key, run) pair per run of equal neighbours;
        # only the key (the repeated character) is kept.
        run_heads = [key for key, _run in groupby(s)]
        return ''.join(run_heads)


class de1337(map):
    """
    Returns a :class:`revscoring.Datasource` that converts numbers in the
    middle of words into the characters they are often used
    to represent (e.g. "he7d3r" --> "hetder").

    :Parameters:
        strs_datasource : :class:`revscoring.Datasource`
            A datasource that generates a list of `str`
        name : `str`
            A name for the datasource.
    """
    MAP = {'1': "l", '3': "e", '4': "a", '5': "s",
           '6': "g", '7': "t", '0': "o", "#": "h", "(": "c"}

    def __init__(self, strs_datasource, name=None):
        name = self._format_name(name, [strs_datasource])
github wikimedia / revscoring / revscoring / features / wikitext / edit / diff / chars.py View on Github external
from itertools import groupby

from . import datasources
from .....datasources.meta import mappers
from ....feature import Feature
from ....meta import aggregators
from .util import prefix

# `len` is handed to `mappers.map` together with
# `datasources.segments_added` (presumably applied to each segment —
# confirm), and `aggregators.sum` totals the result; the feature is
# declared as returning `int`.
chars_added = aggregators.sum(
    mappers.map(len, datasources.segments_added),
    name=prefix + ".chars_added", returns=int
)
"""
A count of the number of characters added in this edit.
"""

# `len` is handed to `mappers.map` together with
# `datasources.segments_removed` (presumably applied to each segment —
# confirm), and `aggregators.sum` totals the result; the feature is
# declared as returning `int`.
chars_removed = aggregators.sum(
    mappers.map(len, datasources.segments_removed),
    name=prefix + ".chars_removed", returns=int
)
"""
A count of the number of characters removed in this edit.
"""

numeric_chars_added = aggregators.sum(
    mappers.map(len, datasources.numbers_added),
github wikimedia / revscoring / revscoring / features / wikitext / datasources / parsed.py View on Github external
name=self._name + ".external_link_url"
        )
        """
        A list of external link urls
        """

        self.wikilinks = get_key(
            mwparserfromhell.nodes.Wikilink, self.node_class_map,
            default=[],
            name=self._name + ".wikilinks"
        )
        """
        A list of :class:`mwparserfromhell.nodes.Wikilink` objects
        """

        self.wikilink_titles = mappers.map(
            _extract_wikilink_title, self.wikilinks,
            name=self._name + ".wikilink_titles"
        )
        """
        Returns a list of string titles of internal links (aka "targets")
        """

        self.tags = get_key(
            mwparserfromhell.nodes.Tag, self.node_class_map,
            default=[],
            name=self._name + ".tags"
        )
        """
        A list of :class:`mwparserfromhell.nodes.Tag` objects
        """
github wikimedia / revscoring / revscoring / languages / features / stemmed / datasources.py View on Github external
def __init__(self, name, stem_word, wikitext_diff, revision):
        """
        Build the stem datasources derived from `wikitext_diff` and
        `revision`.

        :Parameters:
            name
                Passed to the parent constructor and used as the prefix of
                every datasource name created here.
            stem_word
                Handed to `mappers.map` together with the added/removed
                words; presumably stems a single word (confirm).
            wikitext_diff
                Object exposing `words_added` and `words_removed`.
            revision
                Object exposing `stem_frequency` and
                `parent.stem_frequency`.
        """
        super().__init__(name)

        # `stem_word` mapped over the words added / removed by the edit.
        self.stems_added = mappers.map(
            stem_word, wikitext_diff.words_added,
            name=name + ".stems_added"
        )
        self.stems_removed = mappers.map(
            stem_word, wikitext_diff.words_removed,
            name=name + ".stems_removed"
        )

        # Delta between the parent revision's stem frequencies and the
        # current revision's stem frequencies.
        self.stem_delta = frequencies.delta(
            revision.parent.stem_frequency,
            revision.stem_frequency,
            name=name + ".stem_delta"
        )
        # Built from the parent's stem frequencies and `stem_delta` above;
        # presumably the delta expressed as a proportion (confirm).
        self.stem_prop_delta = frequencies.prop_delta(
            revision.parent.stem_frequency, self.stem_delta,
            name=name + ".stem_prop_delta"
        )
github wikimedia / revscoring / revscoring / features / wikitext / edit / diff / chars.py View on Github external
name=prefix + ".url_chars_added", returns=int
)
"""
A count of the number of url characters added in this edit.
"""

url_chars_removed = aggregators.sum(
    mappers.map(len, datasources.urls_removed),
    name=prefix + ".url_chars_removed", returns=int
)
"""
A count of the number of url characters removed in this edit.
"""

word_chars_added = aggregators.sum(
    mappers.map(len, datasources.words_added),
    name=prefix + ".word_chars_added", returns=int
)
"""
A count of the number of word characters added in this edit.
"""

word_chars_removed = aggregators.sum(
    mappers.map(len, datasources.words_removed),
    name=prefix + ".word_chars_removed", returns=int
)
"""
A count of the number of word characters removed in this edit.
"""

uppercase_word_chars_added = aggregators.sum(
    mappers.map(len, datasources.uppercase_words_added),
github wikimedia / revscoring / revscoring / languages / features / stemmed / features.py View on Github external
def __init__(self, name, revision_datasources):
        """
        Build the stem-based features on top of `revision_datasources`.

        :Parameters:
            name
                Passed to the parent constructor and used as the prefix of
                every feature name created here.
            revision_datasources
                Object exposing `stem_frequency` and `stems`, and
                optionally `parent` and `diff`.
        """
        super().__init__(name)

        self.datasources = revision_datasources

        # Number of keys in the stem-frequency table.
        self.unique_stems = aggregators.len(
            dicts.keys(self.datasources.stem_frequency),
            name=name + ".stems"
        )
        """
        `int` : A count of unique stemmed words.
        """

        # `len` mapped over the stems, then summed.
        self.stem_chars = aggregators.sum(
            mappers.map(len, self.datasources.stems),
            name=name + ".stems_length",
            returns=int
        )
        """
        `int` : A count of characters in stemmed words.
        """

        # The parent and diff feature sets are only wired up when the
        # datasources object actually provides those attributes.
        if hasattr(self.datasources, 'parent'):
            self.parent = Revision(name + ".parent", self.datasources.parent)
            """
            :class:`~revscoring.languages.features.stemmed.Revision` : The
            parent revision
            """

        if hasattr(self.datasources, 'diff'):
            self.diff = Diff(name + ".diff", self.datasources.diff)
github wikimedia / revscoring / revscoring / features / wikitext / features / chars.py View on Github external
"`int` : The number of punctuation characters added"

        self.punctuation_chars_removed = aggregators.sum(
            mappers.map(len, self.datasources.punctuations_removed),
            name=self._name + ".punctuation_chars_removed", returns=int
        )
        "`int` : The number of punctuation characters removed"

        self.break_chars_added = aggregators.sum(
            mappers.map(len, self.datasources.breaks_added),
            name=self._name + ".break_chars_added", returns=int
        )
        "`int` : The number of break characters added"

        self.break_chars_removed = aggregators.sum(
            mappers.map(len, self.datasources.breaks_removed),
            name=self._name + ".break_chars_removed", returns=int
        )
        "`int` : The number of break characters removed"

        self.longest_repeated_char_added = \
            Feature(self._name + ".longest_repeated_char_added",
                    _process_longest_repeated_char_added,
                    returns=int, depends_on=[self.datasources.segments_added])
        "`int` : The most repeated character added"
github wikimedia / revscoring / revscoring / features / wikitext / edit / diff / chars.py View on Github external
name=prefix + ".cjk_chars_removed", returns=int
)
"""
A count of the number of cjk characters removed in this edit.
"""

entity_chars_added = aggregators.sum(
    mappers.map(len, datasources.entities_added),
    name=prefix + ".entity_chars_added", returns=int
)
"""
A count of the number of entity characters added in this edit.
"""

entity_chars_removed = aggregators.sum(
    mappers.map(len, datasources.entities_removed),
    name=prefix + ".entity_chars_removed", returns=int
)
"""
A count of the number of entity characters removed in this edit.
"""

url_chars_added = aggregators.sum(
    mappers.map(len, datasources.urls_added),
    name=prefix + ".url_chars_added", returns=int
)
"""
A count of the number of url characters added in this edit.
"""

url_chars_removed = aggregators.sum(
    mappers.map(len, datasources.urls_removed),