        self.chars = aggregators.len(
            self.datasources.text,
name=self._name + ".chars"
)
"`int` : The number of characters in the text"
self.numeric_chars = aggregators.sum(
mappers.map(len, self.datasources.numbers),
name=self._name + ".numeric_chars", returns=int
)
"`int` : The number of numeric characters in the text"
self.whitespace_chars = aggregators.sum(
mappers.map(len, self.datasources.whitespaces),
name=self._name + ".whitespace_chars", returns=int
)
"`int` : The number of whitespace characters in the text"
self.markup_chars = aggregators.sum(
mappers.map(len, self.datasources.markups),
name=self._name + ".markup_chars", returns=int
)
"`int` : The number of wikitext markup characters in the text"
self.cjk_chars = aggregators.sum(
mappers.map(len, self.datasources.cjks),
name=self._name + ".cjk_chars", returns=int
)
"`int` : The number of Chinese/Japanese/Korean characters in the text"
self.entity_chars = aggregators.sum(
mappers.map(len, self.datasources.entities),
name=self._name + ".entity_chars", returns=int
)
"`int` : The number of HTML entity characters in the text"
self.url_chars = aggregators.sum(
mappers.map(len, self.datasources.urls),
name=self._name + ".url_chars", returns=int
cn_templates = wikitext.revision.template_names_matching(
r"erref[ _]behar", name="euwiki.revision.cn_templates")
# Links
# Excluding category_links based on https://phabricator.wikimedia.org/T240467
# category_links = wikitext.revision.wikilink_titles_matching(
# r"(Kategoria|Category)\:", name="euwiki.revision.category_links")
image_links = wikitext.revision.wikilink_titles_matching(
r"(File|Image|Fitxategi)\:", name="euwiki.revision.image_links")
# References
revision = Revision(
"euwiki.revision.revision",
wikitext.revision.datasources,
)
paragraphs = mappers.map(
str, revision.paragraphs_sentences_and_whitespace,
name="euwiki.revision.paragraphs"
)
paragraphs_without_refs = filters.regex_matching(
r"^(?!\s*$)((?!)(.|\n))*$",
paragraphs,
name="euwiki.revision.paragraphs_without_refs"
)
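# Illustrative note (not from the original source): the regex above keeps
# only paragraphs that are non-blank and contain no "<ref>" tag anywhere,
# conceptually:
#
#     >>> import re
#     >>> p = re.compile(r"^(?!\s*$)((?!<ref>)(.|\n))*$")
#     >>> bool(p.match("A claim.<ref>source</ref>")), bool(p.match("No refs here."))
#     (False, True)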
paragraphs_without_refs_total_length = aggregators.sum(
mappers.map(len, paragraphs_without_refs),
name="euwiki.revision.paragraphs_without_refs_total_length"
)
local_wiki = [
image_links,
    image_links / max(wikitext.revision.content_chars, 1),
]
class derepeat(map):
    """
    Returns a :class:`revscoring.Datasource` that collapses runs of
    repeated characters in each str (e.g. "haaaate" --> "hate").

    :Parameters:
strs_datasource : :class:`revscoring.Datasource`
A datasource that generates a list of `str`
name : `str`
A name for the datasource.
"""
def __init__(self, strs_datasource, name=None):
name = self._format_name(name, [strs_datasource])
super().__init__(self.no_repeat, strs_datasource, name=name)
def no_repeat(self, s):
return ''.join(char for char, group in groupby(s))
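    # Illustrative example (not from the original source): groupby()
    # emits one (char, group) pair per run of equal characters, so
    # letter-stretching collapses:
    #
    #     >>> from itertools import groupby
    #     >>> ''.join(char for char, _ in groupby("haaaaate"))
    #     'hate'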
class de1337(map):
"""
    Returns a :class:`revscoring.Datasource` that converts look-alike
    numbers and symbols within words into the letters they are often
    used to represent (e.g. "he7d3r" --> "hetder").
:Parameters:
strs_datasource : :class:`revscoring.Datasource`
A datasource that generates a list of `str`
name : `str`
A name for the datasource.
"""
MAP = {'1': "l", '3': "e", '4': "a", '5': "s",
'6': "g", '7': "t", '0': "o", "#": "h", "(": "c"}
def __init__(self, strs_datasource, name=None):
        name = self._format_name(name, [strs_datasource])
        super().__init__(self.no1337, strs_datasource, name=name)

    def no1337(self, s):
        # Substitute each mapped look-alike character; pass others through.
        return ''.join(self.MAP.get(c, c) for c in s)
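    # Illustrative example (not from the original source): MAP substitutes
    # look-alike digits/symbols back into letters:
    #
    #     >>> ''.join(de1337.MAP.get(c, c) for c in "he7d3r")
    #     'hetder'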
from itertools import groupby
from . import datasources
from .....datasources.meta import mappers
from ....feature import Feature
from ....meta import aggregators
from .util import prefix
chars_added = aggregators.sum(
mappers.map(len, datasources.segments_added),
name=prefix + ".chars_added", returns=int
)
"""
A count of the number of characters added in this edit.
"""
chars_removed = aggregators.sum(
mappers.map(len, datasources.segments_removed),
name=prefix + ".chars_removed", returns=int
)
"""
A count of the number of characters removed in this edit.
"""
numeric_chars_added = aggregators.sum(
    mappers.map(len, datasources.numbers_added),
    name=prefix + ".numeric_chars_added", returns=int
)
"""
A count of the number of numeric characters added in this edit.
"""
name=self._name + ".external_link_url"
)
"""
A list of external link urls
"""
self.wikilinks = get_key(
mwparserfromhell.nodes.Wikilink, self.node_class_map,
default=[],
name=self._name + ".wikilinks"
)
"""
        A list of :class:`mwparserfromhell.nodes.wikilink.Wikilink`'s
"""
self.wikilink_titles = mappers.map(
_extract_wikilink_title, self.wikilinks,
name=self._name + ".wikilink_titles"
)
"""
        A list of `str` titles of internal links (aka "targets")
"""
self.tags = get_key(
mwparserfromhell.nodes.Tag, self.node_class_map,
default=[],
name=self._name + ".tags"
)
"""
        A list of :class:`mwparserfromhell.nodes.tag.Tag`'s
"""
def __init__(self, name, stem_word, wikitext_diff, revision):
super().__init__(name)
self.stems_added = mappers.map(
stem_word, wikitext_diff.words_added,
name=name + ".stems_added"
)
self.stems_removed = mappers.map(
stem_word, wikitext_diff.words_removed,
name=name + ".stems_removed"
)
self.stem_delta = frequencies.delta(
revision.parent.stem_frequency,
revision.stem_frequency,
name=name + ".stem_delta"
)
self.stem_prop_delta = frequencies.prop_delta(
revision.parent.stem_frequency, self.stem_delta,
name=name + ".stem_prop_delta"
)
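        # Illustrative note (hypothetical values, not from the original
        # source): if an edit adds the words "running" and "runs" and
        # stem_word maps both to "run", then stems_added == ["run", "run"],
        # and stem_delta records the per-stem frequency change between the
        # parent and current revisions, e.g. {"run": 2}.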
name=prefix + ".url_chars_added", returns=int
)
"""
A count of the number of url characters added in this edit.
"""
url_chars_removed = aggregators.sum(
mappers.map(len, datasources.urls_removed),
name=prefix + ".url_chars_removed", returns=int
)
"""
A count of the number of url characters removed in this edit.
"""
word_chars_added = aggregators.sum(
mappers.map(len, datasources.words_added),
name=prefix + ".word_chars_added", returns=int
)
"""
A count of the number of word characters added in this edit.
"""
word_chars_removed = aggregators.sum(
mappers.map(len, datasources.words_removed),
name=prefix + ".word_chars_removed", returns=int
)
"""
A count of the number of word characters removed in this edit.
"""
uppercase_word_chars_added = aggregators.sum(
    mappers.map(len, datasources.uppercase_words_added),
    name=prefix + ".uppercase_word_chars_added", returns=int
)
"""
A count of the number of uppercase word characters added in this edit.
"""
def __init__(self, name, revision_datasources):
super().__init__(name)
self.datasources = revision_datasources
self.unique_stems = aggregators.len(
dicts.keys(self.datasources.stem_frequency),
name=name + ".stems"
)
"""
`int` : A count of unique stemmed words.
"""
self.stem_chars = aggregators.sum(
mappers.map(len, self.datasources.stems),
name=name + ".stems_length",
returns=int
)
"""
`int` : A count of characters in stemmed words.
"""
if hasattr(self.datasources, 'parent'):
self.parent = Revision(name + ".parent", self.datasources.parent)
"""
:class:`~revscoring.languages.features.stemmed.Revision` : The
parent revision
"""
if hasattr(self.datasources, 'diff'):
self.diff = Diff(name + ".diff", self.datasources.diff)
"`int` : The number of punctuation characters added"
self.punctuation_chars_removed = aggregators.sum(
mappers.map(len, self.datasources.punctuations_removed),
name=self._name + ".punctuation_chars_removed", returns=int
)
"`int` : The number of punctuation characters removed"
self.break_chars_added = aggregators.sum(
mappers.map(len, self.datasources.breaks_added),
name=self._name + ".break_chars_added", returns=int
)
"`int` : The number of break characters added"
self.break_chars_removed = aggregators.sum(
mappers.map(len, self.datasources.breaks_removed),
name=self._name + ".break_chars_removed", returns=int
)
"`int` : The number of break characters removed"
self.longest_repeated_char_added = \
Feature(self._name + ".longest_repeated_char_added",
_process_longest_repeated_char_added,
returns=int, depends_on=[self.datasources.segments_added])
"`int` : The most repeated character added"
name=prefix + ".cjk_chars_removed", returns=int
)
"""
A count of the number of cjk characters removed in this edit.
"""
entity_chars_added = aggregators.sum(
mappers.map(len, datasources.entities_added),
name=prefix + ".entity_chars_added", returns=int
)
"""
A count of the number of entity characters added in this edit.
"""
entity_chars_removed = aggregators.sum(
mappers.map(len, datasources.entities_removed),
name=prefix + ".entity_chars_removed", returns=int
)
"""
A count of the number of entity characters removed in this edit.
"""